提交 83698d31 编写于 作者: A Andy Polyakov

sha512-x86_64.pl: >5% better performance.

上级 6a40ebe8
master OpenHarmony-2.2-Beta2 OpenHarmony-2.3-Beta OpenHarmony-3.0-LTS OpenHarmony-3.1-API8-SDK-Public OpenHarmony-3.1-API9-SDK-Canary OpenHarmony-3.1-Beta OpenHarmony-3.1-Release OpenHarmony-3.2-Beta1 OpenHarmony-3.2-Beta2 OpenHarmony-3.2-Beta3 OpenHarmony-3.2-Beta4 OpenHarmony-3.2-Beta5 OpenHarmony-3.2-Release OpenHarmony-4.0-Beta1 OpenHarmony-4.0-Beta2 OpenHarmony-v2.2-Beta OpenHarmony_1.0.1_release OpenHarmony_filemanager_develop_20220505 OpenHarmony_filemanager_develop_20220614 add_issus_pr_template_for_master add_issus_pr_template_for_release feature_IDL_20220811 master_dy monthly_20220614 monthly_20220816 monthly_20221018 monthly_20230815 revert-merge-109-master weekly_20220105 weekly_20220111 weekly_20220118 weekly_20220125 weekly_20220201 weekly_20220208 weekly_20220215 weekly_20220222 weekly_20220301 weekly_20220406 weekly_20220412 weekly_20220419 weekly_20220426 weekly_20220503 weekly_20220510 weekly_20220524 weekly_20220531 weekly_20220607 weekly_20220614 weekly_20220621 weekly_20220628 weekly_20220705 weekly_20220712 weekly_20220719 weekly_20220726 weekly_20220802 weekly_20220809 weekly_20220816 weekly_20220823 weekly_20220830 weekly_20220906 weekly_20220913 weekly_20220920 weekly_20220927 weekly_20221004 weekly_20221011 weekly_20221018 weekly_20221025 weekly_20221101 weekly_20221108 weekly_20221115 weekly_20221122 weekly_20221129 weekly_20221206 weekly_20221213 weekly_20221220 weekly_20221227 weekly_20230103 weekly_20230110 weekly_20230117 weekly_20230124 weekly_20230131 weekly_20230207 weekly_20230214 weekly_20230221 weekly_20230228 weekly_20230307 weekly_20230314 weekly_20230321 weekly_20230328 weekly_20230404 weekly_20230411 weekly_20230418 weekly_20230425 weekly_20230502 weekly_20230509 weekly_20230516 weekly_20230523 weekly_20230530 weekly_20230606 weekly_20230613 weekly_20230619 weekly_20230626 weekly_20230627 weekly_20230704 weekly_20230712 weekly_20230725 weekly_20230801 weekly_20230808 weekly_20230815 weekly_20230822 weekly_20230829 
OpenHarmony_v1.1.1-LTS OpenHarmony_release_v1.1.0 OpenHarmony-v4.0-Beta2 OpenHarmony-v4.0-Beta1 OpenHarmony-v3.2.2-Release OpenHarmony-v3.2.1-Release OpenHarmony-v3.2-Release OpenHarmony-v3.2-Beta5 OpenHarmony-v3.2-Beta4 OpenHarmony-v3.2-Beta3 OpenHarmony-v3.2-Beta2 OpenHarmony-v3.2-Beta1 OpenHarmony-v3.1.7-Release OpenHarmony-v3.1.6-Release OpenHarmony-v3.1.5-Release OpenHarmony-v3.1.4-Release OpenHarmony-v3.1.3-Release OpenHarmony-v3.1.2-Release OpenHarmony-v3.1.1-Release OpenHarmony-v3.1-Release OpenHarmony-v3.1-Beta OpenHarmony-v3.0.8-LTS OpenHarmony-v3.0.7-LTS OpenHarmony-v3.0.6-LTS OpenHarmony-v3.0.5-LTS OpenHarmony-v3.0.3-LTS OpenHarmony-v3.0.2-LTS OpenHarmony-v3.0.1-LTS OpenHarmony-v3.0-LTS OpenHarmony-v3.0-Beta1 OpenHarmony-v2.2-Beta2 OpenHarmony-v1.1.5-LTS OpenHarmony-v1.1.4-LTS OpenHarmony-v1.1.3-LTS OpenHarmony-v1.1.2-LTS OpenHarmony-v1.1.1-LTS OpenHarmony-2.0-Canary OpenHarmony-1.0
无相关合并请求
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
......@@ -39,6 +39,11 @@
# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
# sha256_block:-( This is presumably because 64-bit shifts/rotates
# apparently are not atomic instructions, but implemented in microcode.
#
# May 2012.
#
# Optimization, including one of Pavel Semjanov's ideas (an alternative
# Maj), resulted in >=5% improvement on most CPUs and 20% on P4.
$flavour = shift;
$output = shift;
......@@ -59,7 +64,7 @@ if ($output =~ /512/) {
$SZ=8;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
"%r8", "%r9", "%r10","%r11");
($T1,$a0,$a1,$a2)=("%r12","%r13","%r14","%r15");
($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
......@@ -71,7 +76,7 @@ if ($output =~ /512/) {
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2)=("%r12d","%r13d","%r14d","%r15d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
......@@ -79,8 +84,7 @@ if ($output =~ /512/) {
$rounds=64;
}
$ctx="%rdi"; # 1st arg
$round="%rdi"; # zaps $ctx
$ctx="%rdi"; # 1st arg, zapped by $a3
$inp="%rsi"; # 2nd arg
$Tbl="%rbp";
......@@ -97,68 +101,71 @@ sub ROUND_00_15()
$code.=<<___;
ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
mov $f,$a2
mov $T1,`$SZ*($i&0xf)`(%rsp)
ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
xor $e,$a0
xor $g,$a2 # f^g
ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
add $h,$T1 # T1+=h
xor $a,$a1
add ($Tbl,$round,$SZ),$T1 # T1+=K[round]
and $e,$a2 # (f^g)&e
mov $b,$h
mov $T1,`$SZ*($i&0xf)`(%rsp)
add $h,$T1 # T1+=h
xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
add $a2,$T1 # T1+=Ch(e,f,g)
xor $e,$a0
xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
xor $c,$h # b^c
add ($Tbl),$T1 # T1+=K[round]
mov $a,$a2
xor $a,$a1
add $a2,$T1 # T1+=Ch(e,f,g)
mov $b,$a2
ror \$$Sigma1[0],$a0 # Sigma1(e)
and $a,$h # h=(b^c)&a
and $c,$a2 # b&c
xor $b,$a2 # a^b, b^c in next round
mov $b,$h
ror \$$Sigma0[0],$a1 # Sigma0(a)
and $a2,$a3
add $a0,$T1 # T1+=Sigma1(e)
add $a2,$h # h+=b&c (completes +=Maj(a,b,c))
xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
add $T1,$d # d+=T1
add $T1,$h # h+=T1
lea 1($round),$round # round++
___
$code.=<<___ if ($i>=15);
mov `$SZ*(($i+2)&0xf)`(%rsp),$a0
___
$code.=<<___;
lea $SZ($Tbl),$Tbl # round++
add $a1,$h # h+=Sigma0(a)
___
($a2,$a3) = ($a3,$a2);
}
sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
#mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
mov `$SZ*(($i+14)&0xf)`(%rsp),$a1
mov $a0,$T1
ror \$`$sigma0[1]-$sigma0[0]`,$a0
mov $a1,$a2
ror \$`$sigma1[1]-$sigma1[0]`,$a1
ror \$`$sigma0[1]-$sigma0[0]`,$T1
xor $a0,$T1
shr \$$sigma0[2],$a0
xor $T1,$a0
shr \$$sigma0[2],$T1
ror \$$sigma0[0],$a0
xor $a2,$a1
shr \$$sigma1[2],$a2
ror \$$sigma0[0],$T1
xor $T1,$a0 # sigma0(X[(i+1)&0xf])
mov `$SZ*(($i+9)&0xf)`(%rsp),$T1
ror \$`$sigma1[1]-$sigma1[0]`,$a2
xor $a1,$a2
shr \$$sigma1[2],$a1
ror \$$sigma1[0],$a2
add $a0,$T1
xor $a0,$T1 # sigma0(X[(i+1)&0xf])
ror \$$sigma1[0],$a1
add `$SZ*(($i+9)&0xf)`(%rsp),$T1
xor $a2,$a1 # sigma1(X[(i+14)&0xf])
add `$SZ*($i&0xf)`(%rsp),$T1
......@@ -193,8 +200,6 @@ $func:
mov %r11,$_rsp # save copy of %rsp
.Lprologue:
lea $TABLE(%rip),$Tbl
mov $SZ*0($ctx),$A
mov $SZ*1($ctx),$B
mov $SZ*2($ctx),$C
......@@ -207,7 +212,9 @@ $func:
.align 16
.Lloop:
xor $round,$round
mov $B,$a3
lea $TABLE(%rip),$Tbl
xor $C,$a3 # magic
___
for($i=0;$i<16;$i++) {
$code.=" mov $SZ*$i($inp),$T1\n";
......@@ -228,8 +235,8 @@ ___
}
$code.=<<___;
cmp \$$rounds,$round
jb .Lrounds_16_xx
testl \$-1,($Tbl)
jnz .Lrounds_16_xx
mov $_ctx,$ctx
lea 16*$SZ($inp),$inp
......@@ -289,6 +296,8 @@ $TABLE:
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0
.asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
} else {
$code.=<<___;
......@@ -335,6 +344,8 @@ $TABLE:
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.long 0
.asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册
反馈
建议
客服 返回
顶部