Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenHarmony
Third Party Openssl
提交
8626230a
T
Third Party Openssl
项目概览
OpenHarmony
/
Third Party Openssl
大约 1 年 前同步成功
通知
9
Star
18
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
Third Party Openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
8626230a
编写于
2月 09, 2009
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
s390x assembler pack update.
上级
c23632d3
变更
7
展开全部
隐藏空白更改
内联
并排
Showing
7 changed files
with
799 additions
and
444 deletions
+799
-444
Configure
Configure
+1
-1
TABLE
TABLE
+3
-3
crypto/aes/asm/aes-s390x.pl
crypto/aes/asm/aes-s390x.pl
+535
-273
crypto/bn/asm/s390x-mont.pl
crypto/bn/asm/s390x-mont.pl
+55
-49
crypto/s390xcpuid.S
crypto/s390xcpuid.S
+83
-0
crypto/sha/asm/sha1-s390x.pl
crypto/sha/asm/sha1-s390x.pl
+87
-89
crypto/sha/asm/sha512-s390x.pl
crypto/sha/asm/sha512-s390x.pl
+35
-29
未找到文件。
Configure
浏览文件 @
8626230a
...
...
@@ -131,7 +131,7 @@ my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::void";
my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::::::::void";
my $mips3_asm=":bn-mips3.o::::::::::::void";
my $s390x_asm="
:bn-s390x.o::aes_cbc.o
aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o:::::::void";
my $s390x_asm="
s390xcpuid.o:bn-s390x.o s390x-mont.o::
aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o:::::::void";
my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::void";
my $ppc32_asm="ppccpuid.o:bn-ppc.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::";
my $ppc64_asm="ppccpuid.o:bn-ppc.o ppc-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::";
...
...
TABLE
浏览文件 @
8626230a
...
...
@@ -3542,10 +3542,10 @@ $thread_cflag = -D_REENTRANT
$sys_id =
$lflags = -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
$cpuid_obj =
$bn_obj = bn-s390x.o
$cpuid_obj =
s390xcpuid.o
$bn_obj = bn-s390x.o
s390x-mont.o
$des_obj =
$aes_obj = aes
_cbc.o aes
-s390x.o
$aes_obj = aes-s390x.o
$bf_obj =
$md5_obj =
$sha1_obj = sha1-s390x.o sha256-s390x.o sha512-s390x.o
...
...
crypto/aes/asm/aes-s390x.pl
浏览文件 @
8626230a
此差异已折叠。
点击以展开。
crypto/bn/asm/s390x-mont.pl
浏览文件 @
8626230a
...
...
@@ -27,6 +27,11 @@
# module performance by implementing dedicated squaring code-path and
# possibly by unrolling loops...
# January 2009.
#
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.
$mn0
=
"
%r0
";
$num
=
"
%r1
";
...
...
@@ -47,7 +52,7 @@ $nhi="%r10";
$nlo
=
"
%r11
";
$AHI
=
"
%r12
";
$NHI
=
"
%r13
";
$
fp
=
"
%r14
";
$
count
=
"
%r14
";
$sp
=
"
%r15
";
$code
.=
<<___;
...
...
@@ -57,44 +62,46 @@ $code.=<<___;
bn_mul_mont:
lgf $num,164($sp) # pull $num
sla $num,3 # $num to enumerate bytes
la $rp,0($num,$rp) # pointers to point at the vectors' ends
la $ap,0($num,$ap)
la $bp,0($num,$bp)
la $np,0($num,$np)
stmg %r2,%r15,16($sp)
cghi $num,16 #
lghi %r2,0 #
blr %r14 # if($num<16) return 0;
cghi $num,128 #
bhr %r14 # if($num>128) return 0;
lcgr $num,$num # -$num
lghi $rp,-160-8 # leave room for carry bit
lcgr $j,$num # -$num
lgr %r0,$sp
lgr $fp,$sp
aghi $fp,-160-8 # leave room for carry bit
la $sp,0($num,$fp) # alloca
stg %r0,0($sp)
aghi $fp,160-8 # $fp to point at tp[$num-1]
la $rp,0($rp,$sp)
la $sp,0($j,$rp) # alloca
stg %r0,0($sp) # back chain
la $bp,0($num,$bp) # restore $bp
sra $num,3 # restore $num
la $bp,0($j,$bp) # restore $bp
ahi $num,-1 # adjust $num for inner loop
lg $n0,0($n0) # pull n0
lg $bi,0($bp)
lg $alo,0($
num,$
ap)
lg $alo,0($ap)
mlgr $ahi,$bi # ap[0]*bp[0]
lgr $AHI,$ahi
lgr $mn0,$alo # "tp[0]"*n0
msgr $mn0,$n0
lg $nlo,0($n
um,$np)
#
lg $nlo,0($n
p)
#
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
alcgr $NHI,$nhi
lgr $j,$num
aghi $j,8 # j=1
la $j,8(%r0) # j=1
lr $count,$num
.align 16
.L1st:
lg $alo,0($j,$ap)
mlgr $ahi,$bi # ap[j]*bp[0]
...
...
@@ -110,43 +117,45 @@ bn_mul_mont:
algr $nlo,$alo
alcgr $NHI,$nhi
stg $nlo,
0($j,$f
p) # tp[j-1]=
aghi $j,8
# j++
jnz
.L1st
stg $nlo,
160-8($j,$s
p) # tp[j-1]=
la $j,8($j)
# j++
brct $count,
.L1st
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI # upmost overflow bit
stg $NHI,
0($f
p)
stg $AHI,
8($f
p)
stg $NHI,
160-8($j,$s
p)
stg $AHI,
160($j,$s
p)
la $bp,8($bp) # bp++
.Louter:
lg $bi,0($bp) # bp[i]
lg $alo,0($
num,$
ap)
lg $alo,0($ap)
mlgr $ahi,$bi # ap[0]*bp[i]
alg $alo,
8($num,$fp)
# +=tp[0]
alg $alo,
160($sp)
# +=tp[0]
lghi $AHI,0
alcgr $AHI,$ahi
lgr $mn0,$alo
msgr $mn0,$n0
# tp[0]*n0
msgr $mn0,$n0 # tp[0]*n0
lg $nlo,0($n
um,$np)
# np[0]
lg $nlo,0($n
p)
# np[0]
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
alcgr $NHI,$nhi
lgr $j,$num
aghi $j,8 # j=1
la $j,8(%r0) # j=1
lr $count,$num
.align 16
.Linner:
lg $alo,0($j,$ap)
mlgr $ahi,$bi # ap[j]*bp[i]
algr $alo,$AHI
lghi $AHI,0
alcgr $ahi,$AHI
alg $alo,
8($j,$fp)
# +=tp[j]
alg $alo,
160($j,$sp)
# +=tp[j]
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
...
...
@@ -157,34 +166,29 @@ bn_mul_mont:
algr $nlo,$alo # +="tp[j]"
alcgr $NHI,$nhi
stg $nlo,
0($j,$f
p) # tp[j-1]=
aghi $j,8
# j++
jnz
.Linner
stg $nlo,
160-8($j,$s
p) # tp[j-1]=
la $j,8($j)
# j++
brct $count,
.Linner
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI
alg $NHI,
8($fp)
# accumulate previous upmost overflow bit
alg $NHI,
160($j,$sp)
# accumulate previous upmost overflow bit
lghi $ahi,0
alcgr $AHI,$ahi # new upmost overflow bit
stg $NHI,
0($f
p)
stg $AHI,
8($f
p)
stg $NHI,
160-8($j,$s
p)
stg $AHI,
160($j,$s
p)
la $bp,8($bp) # bp++
clg $bp,16
+32($f
p) # compare to &bp[num]
clg $bp,16
0+8+32($j,$s
p) # compare to &bp[num]
jne .Louter
___
undef
$bi
;
$count
=
$bp
;
undef
$bp
;
$code
.=
<<___;
lg $rp,16+16($fp) # reincarnate rp
la $ap,8($fp)
lgr $j,$num
lg $rp,160+8+16($j,$sp) # reincarnate rp
la $ap,160($sp)
ahi $num,1 # restore $num, incidentally clears "borrow"
l
cgr $count,$num
sra $count,3 # incidentally clears "borrow"
l
a $j,0(%r0)
lr $count,$num
.Lsub: lg $alo,0($j,$ap)
slbg $alo,0($j,$np)
stg $alo,0($j,$rp)
...
...
@@ -198,15 +202,17 @@ $code.=<<___;
xgr $np,$AHI
ngr $np,$rp
ogr $ap,$np # ap=borrow?tp:rp
lgr $j,$num
la $j,0(%r0)
lgr $count,$num
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
stg $j,
8($j,$f
p) # zap tp
stg $j,
160($j,$s
p) # zap tp
stg $alo,0($j,$rp)
aghi $j,8
jnz
.Lcopy
la $j,8($j)
brct $count,
.Lcopy
lmg %r6,%r15,16+48($fp)
la %r1,160+8+48($j,$sp)
lmg %r6,%r15,0(%r1)
lghi %r2,1 # signal "processed"
br %r14
.size bn_mul_mont,.-bn_mul_mont
...
...
crypto/s390xcpuid.S
0 → 100644
浏览文件 @
8626230a
# OPENSSL_s390x_facilities
#   Clears %r0, executes STFLE (hand-encoded as a raw opcode word) with
#   the doubleword at 16(%r15) as its store target, then loads that
#   doubleword into %r2 and branches to the address held in %r14.
#   NOTE(review): %r0 = 0 presumably requests a single doubleword of
#   facility bits, and the slot at 16(%r15) is presumably scratch space
#   provided by the caller - confirm against the architecture manual and
#   the ABI.
.text

.globl	OPENSSL_s390x_facilities
.type	OPENSSL_s390x_facilities,@function
.align	16
OPENSSL_s390x_facilities:
	lghi	%r0,0			# %r0 = 0 going into stfle
	.long	0xb2b0f010		# stfle 16(%r15)
	lg	%r2,16(%r15)		# %r2 = doubleword stored above
	br	%r14
.size	OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities
# OPENSSL_rdtsc
#   Stores the clock value with STCK into the doubleword at 16(%r15),
#   loads that doubleword into %r2 and branches to the address held in
#   %r14.
.globl	OPENSSL_rdtsc
.type	OPENSSL_rdtsc,@function
.align	16
OPENSSL_rdtsc:
	stck	16(%r15)		# store clock into the stack slot
	lg	%r2,16(%r15)		# %r2 = stored clock value
	br	%r14
.size	OPENSSL_rdtsc,.-OPENSSL_rdtsc
# OPENSSL_atomic_add
#   Adds the 32-bit value in %r3 to the 32-bit word addressed by %r2
#   using a compare-and-swap retry loop, and leaves the sign-extended new
#   value in %r2.  Uses %r0 and %r1 as scratch.
.globl	OPENSSL_atomic_add
.type	OPENSSL_atomic_add,@function
.align	16
OPENSSL_atomic_add:
	l	%r1,0(%r2)		# %r1 = current word
.Lspin:
	lr	%r0,%r1
	ar	%r0,%r3			# %r0 = current word + addend
	cs	%r1,%r0,0(%r2)		# store %r0 only when memory still equals %r1
	brc	4,.Lspin		# condition code 1: go round again
	lgfr	%r2,%r0			# OpenSSL expects the new value
	br	%r14
.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add
# OPENSSL_wipe_cpu
#   Zeroes general registers %r0, %r1, %r3 and %r4 and floating point
#   registers %f0 through %f7.  %r2 receives a copy of %r15 before the
#   branch to the address held in %r14.
.globl	OPENSSL_wipe_cpu
.type	OPENSSL_wipe_cpu,@function
.align	16
OPENSSL_wipe_cpu:
	xgr	%r0,%r0
	xgr	%r1,%r1
	lgr	%r2,%r15		# %r2 = copy of %r15
	xgr	%r3,%r3
	xgr	%r4,%r4
	lzdr	%f0
	lzdr	%f1
	lzdr	%f2
	lzdr	%f3
	lzdr	%f4
	lzdr	%f5
	lzdr	%f6
	lzdr	%f7
	br	%r14
.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
# OPENSSL_cleanse
#   Zero-fills the buffer whose address is in %r2 and whose byte count is
#   in %r3.  Uses %r0 (the zero being stored) and %r4 as scratch.
#   Counts up to 15 are handled by the byte loop at .Little.  Larger
#   counts go to .Lot, which stores single bytes until the low three
#   address bits are clear, then stores doublewords (count >> 3 of them)
#   and finally sends any leftover count & 7 bytes back to .Little.
#   A count of zero returns immediately: brctg decrements before testing,
#   so entering .Little with %r3 = 0 would store a byte and keep looping
#   until the counter wrapped all the way around.
.globl	OPENSSL_cleanse
.type	OPENSSL_cleanse,@function
.align	16
OPENSSL_cleanse:
	lghi	%r4,15
	lghi	%r0,0
	clgr	%r3,%r4
	jh	.Lot			# more than 15 bytes: align, then go wide
	clgr	%r3,%r0
	bcr	8,%r14			# nothing to wipe when the count is zero
.Little:
	stc	%r0,0(%r2)
	la	%r2,1(%r2)
	brctg	%r3,.Little
	br	%r14
.align	4
.Lot:
	tmll	%r2,7			# low three address bits clear yet?
	jz	.Laligned
	stc	%r0,0(%r2)
	la	%r2,1(%r2)
	brctg	%r3,.Lot
.Laligned:
	srlg	%r4,%r3,3		# %r4 = number of whole doublewords
.Loop:
	stg	%r0,0(%r2)
	la	%r2,8(%r2)
	brctg	%r4,.Loop
	lghi	%r4,7
	ngr	%r3,%r4			# %r3 = leftover byte count
	jnz	.Little
	br	%r14
.size	OPENSSL_cleanse,.-OPENSSL_cleanse
crypto/sha/asm/sha1-s390x.pl
浏览文件 @
8626230a
...
...
@@ -15,14 +15,20 @@
# twist is that SHA1 hardware support is detected and utilized. In
# which case performance can reach further >4.5x for larger chunks.
# January 2009.
#
# Optimize Xupdate for amount of memory references and reschedule
# instructions to favour dual-issue z10 pipeline. On z10 hardware is
# "only" ~2.3x faster than software.
$kimdfunc
=
1
;
# magic function code for kimd instruction
$output
=
shift
;
open
STDOUT
,"
>
$output
";
$
t0
=
"
%r0
"
;
$
t1
=
"
%r1
";
$ctx
=
"
%r2
";
$
K_00_39
=
"
%r0
";
$K
=
$K_00_39
;
$
K_40_79
=
"
%r1
";
$ctx
=
"
%r2
";
$prefetch
=
"
%r2
";
$inp
=
"
%r3
";
$len
=
"
%r4
";
...
...
@@ -31,119 +37,107 @@ $B="%r6";
$C
=
"
%r7
";
$D
=
"
%r8
";
$E
=
"
%r9
";
@V
=
(
$A
,
$B
,
$C
,
$D
,
$E
);
$K_00_19
=
"
%r10
";
$K_20_39
=
"
%r11
";
$K_40_59
=
"
%r12
";
$K_60_79
=
"
%r13
";
$Xi
=
"
%r14
";
$t0
=
"
%r10
";
$t1
=
"
%r11
";
@X
=
("
%r12
","
%r13
","
%r14
");
$sp
=
"
%r15
";
$frame
=
160
+
16
*
4
;
sub
BODY_00_15
{
my
(
$i
,
$a
,
$b
,
$c
,
$d
,
$e
)
=
@_
;
my
$xi
=
(
$i
&
1
)?
$Xi:$t1
;
$code
.=<<
___
if
(
$i
<
16
&&
!
(
$i
&
1
));
lg
$Xi
,`
$i
*4
`(
$inp
)
___
$code
.=
<<___;
alr $e,$K_00_19 ### $i
rll $t0,$a,5
alr $e,$t0
lr $t0,$d
xr $t0,$c
nr $t0,$b
xr $t0,$d
alr $e,$t0
rll $b,$b,30
___
$code
.=<<
___
if
(
$i
<
16
&&
!
(
$i
&
1
));
srlg
$xi
,
$Xi
,
32
stg
$Xi
,`
160+
$i
*4
`(
$sp
)
___
$code
.=
<<___;
alr $e,$xi
___
}
sub
Xupdate
{
my
$i
=
shift
;
$code
.=<<
___
if
(
$i
==
15
);
lg
$prefetch
,
160
(
$sp
)
### Xupdate(16) warm-up
lr
$X
[
0
],
$X
[
2
]
___
return
if
(
$i
&
1
);
# Xupdate is vectorized and executed every 2nd cycle
$code
.=
<<___;
lg $Xi,`160+4*($i%16)`($sp) ### Xupdate($i)
xg $Xi,`160+4*(($i+2)%16)`($sp)
xg $Xi,`160+4*(($i+8)%16)`($sp)
$code
.=<<
___
if
(
$i
<
16
);
lg
$X
[
0
],`
$i
*4
`(
$inp
)
### Xload($i)
rllg
$X
[
1
],
$X
[
0
],
32
___
if
(((
$i
+
13
)
%
16
)
==
15
)
{
$code
.=
<<___;
llgf $t0,`160+4*15`($sp)
x $Xi,`160+0`($sp)
sllg $t0,$t0,32
xgr $Xi,$t0
$code
.=<<
___
if
(
$i
>=
16
);
xgr
$X
[
0
],
$prefetch
### Xupdate($i)
lg
$prefetch
,`
160+4*((
$i
+2)%16)
`(
$sp
)
xg
$X
[
0
],`
160+4*((
$i
+8)%16)
`(
$sp
)
xgr
$X
[
0
],
$prefetch
rll
$X
[
0
],
$X
[
0
],
1
rllg
$X
[
1
],
$X
[
0
],
32
rll
$X
[
1
],
$X
[
1
],
1
rllg
$X
[
0
],
$X
[
1
],
32
lr
$X
[
2
],
$X
[
1
]
# feedback
___
}
else
{
$code
.=
<<___;
xg $Xi,`160+4*(($i+13)%16)`($sp)
$code
.=<<
___
if
(
$i
<=
70
);
stg
$X
[
0
],`
160+4*(
$i
%16)
`(
$sp
)
___
unshift
(
@X
,
pop
(
@X
));
}
sub
BODY_00_19
{
my
(
$i
,
$a
,
$b
,
$c
,
$d
,
$e
)
=
@_
;
my
$xi
=
$X
[
1
];
&Xupdate
(
$i
);
$code
.=
<<___;
rll $Xi,$Xi,1
rllg $t1,$Xi,32
rll $t1,$t1,1
rllg $Xi,$t1,32
stg $Xi,`160+4*($i%16)`($sp)
alr $e,$K ### $i
rll $t1,$a,5
lr $t0,$d
xr $t0,$c
alr $e,$t1
nr $t0,$b
alr $e,$xi
xr $t0,$d
rll $b,$b,30
alr $e,$t0
___
}
sub
BODY_16_19
{
&Xupdate
(
@_
[
0
]);
&BODY_00_15
(
@
_
);
}
sub
BODY_20_39
{
my
(
$i
,
$a
,
$b
,
$c
,
$d
,
$e
)
=
@_
;
my
$xi
=
(
$i
&
1
)?
$Xi:$t1
;
my
$K_XX_XX
=
(
$i
<
40
)?
$K_20_39:$K_60_79
;
my
$xi
=
$X
[
1
];
&Xupdate
(
$i
);
$code
.=
<<___;
alr $e,$K_XX_XX ### $i
rll $t0,$a,5
alr $e,$t0
alr $e,$K ### $i
rll $t1,$a,5
lr $t0,$b
alr $e,$t1
xr $t0,$c
alr $e,$xi
xr $t0,$d
alr $e,$t0
rll $b,$b,30
alr $e,$
xi
alr $e,$
t0
___
}
sub
BODY_40_59
{
my
(
$i
,
$a
,
$b
,
$c
,
$d
,
$e
)
=
@_
;
my
$xi
=
(
$i
&
1
)?
$Xi:$t1
;
my
$xi
=
$X
[
1
]
;
&Xupdate
(
$i
);
$code
.=
<<___;
alr $e,$K_40_59 ### $i
rll $t0,$a,5
alr $e,$t0
alr $e,$K ### $i
rll $t1,$a,5
lr $t0,$b
alr $e,$t1
or $t0,$c
nr $t0,$d
alr $e,$xi
lr $t1,$b
nr $t0,$d
nr $t1,$c
alr $e,$xi
or $t0,$t1
alr $e,$t0
rll $b,$b,30
alr $e,$t0
___
}
$code
.=
<<___;
.text
.align 64
.type Ktable,\@object
Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
.skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0
.size Ktable,.-Ktable
.globl sha1_block_data_order
.type sha1_block_data_order,\@function
sha1_block_data_order:
...
...
@@ -165,37 +159,43 @@ $code.=<<___ if ($kimdfunc);
.
Lsoftware:
___
$code
.=
<<___;
lghi %r1,-$frame
stg $ctx,16($sp)
stmg %r6,%r15,48($sp)
lgr %r0,$sp
aghi $sp,-$frame
la $sp,0(%r1,$sp)
stg %r0,0($sp)
sllg $len,$len,6
la $len,0($inp,$len)
larl $t0,Ktable
llgf $A,0($ctx)
llgf $B,4($ctx)
llgf $C,8($ctx)
llgf $D,12($ctx)
llgf $E,16($ctx)
llilh $K_00_19,0x5a82
oill $K_00_19,0x7999
llilh $K_20_39,0x6ed9
oill $K_20_39,0xeba1
llilh $K_40_59,0x8f1b
oill $K_40_59,0xbcdc
llilh $K_60_79,0xca62
oill $K_60_79,0xc1d6
lg $K_00_39,0($t0)
lg $K_40_79,8($t0)
.Lloop:
rllg $K_00_39,$K_00_39,32
___
for
(
$i
=
0
;
$i
<
20
;
$i
++
)
{
&BODY_00_19
(
$i
,
@V
);
unshift
(
@V
,
pop
(
@V
));
}
$code
.=
<<___;
rllg $K_00_39,$K_00_39,32
___
for
(
$i
=
0
;
$i
<
16
;
$i
++
)
{
&BODY_00_15
(
$i
,
@V
);
unshift
(
@V
,
pop
(
@V
));
}
for
(;
$i
<
20
;
$i
++
)
{
&BODY_16_19
(
$i
,
@V
);
unshift
(
@V
,
pop
(
@V
));
}
for
(;
$i
<
40
;
$i
++
)
{
&BODY_20_39
(
$i
,
@V
);
unshift
(
@V
,
pop
(
@V
));
}
$code
.=<<
___
;
$K
=
$K_40_79
;
rllg
$K_40_79
,
$K_40_79
,
32
___
for
(;
$i
<
60
;
$i
++
)
{
&BODY_40_59
(
$i
,
@V
);
unshift
(
@V
,
pop
(
@V
));
}
$code
.=
<<___;
rllg $K_40_79,$K_40_79,32
___
for
(;
$i
<
80
;
$i
++
)
{
&BODY_20_39
(
$i
,
@V
);
unshift
(
@V
,
pop
(
@V
));
}
$code
.=
<<___;
lg $ctx,`$frame+16`($sp)
la $inp,64($inp)
al $A,0($ctx)
al $B,4($ctx)
al $C,8($ctx)
...
...
@@ -206,9 +206,7 @@ $code.=<<___;
st $C,8($ctx)
st $D,12($ctx)
st $E,16($ctx)
la $inp,64($inp)
clgr $inp,$len
jne .Lloop
brct $len,.Lloop
lmg %r6,%r15,`$frame+48`($sp)
br %r14
...
...
crypto/sha/asm/sha512-s390x.pl
浏览文件 @
8626230a
...
...
@@ -20,9 +20,15 @@
#
# sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
# January 2009.
#
# Add support for hardware SHA512 and reschedule instructions to
# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
# than software.
$t0
=
"
%r0
";
$t1
=
"
%r1
";
$ctx
=
"
%r2
";
$ctx
=
"
%r2
";
$t2
=
"
%r2
";
$inp
=
"
%r3
";
$len
=
"
%r4
";
# used as index in inner loop
...
...
@@ -54,7 +60,7 @@ if ($output =~ /512/) {
@sigma0
=
(
56
,
63
,
7
);
@sigma1
=
(
3
,
45
,
6
);
$rounds
=
80
;
$kimdfunc
=
0
;
# 0 means unknown/unsupported/unimplemented
$kimdfunc
=
3
;
# 0 means unknown/unsupported/unimplemented/disabled
}
else
{
$label
=
"
256
";
$SZ
=
4
;
...
...
@@ -83,32 +89,32 @@ ___
$code
.=
<<___;
$ROT $t0,$e,$Sigma1[0]
$ROT $t1,$e,$Sigma1[1]
lgr $t2,$f
xgr $t0,$t1
$ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
xgr $t0,$t1 # Sigma1(e)
xgr $t2,$g
$ST $T1,`160+$SZ*($i%16)`($sp)
xgr $t0,$t1 # Sigma1(e)
la $T1,0($T1,$h) # T1+=h
ngr $t2,$e
lgr $t1,$a
algr $T1,$t0 # T1+=Sigma1(e)
algr $T1,$h # T1+=h
$ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
lgr $t0,$f
xgr $t0,$g
ngr $t0,$e
xgr $t0,$g # Ch(e,f,g)
algr $T1,$t0 # T1+=Ch(e,f,g)
$ROT $h,$a,$Sigma0[0]
xgr $t2,$g # Ch(e,f,g)
$ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
$ROT $t0,$a,$Sigma0[1]
algr $T1,$t2 # T1+=Ch(e,f,g)
ogr $t1,$b
xgr $h,$t0
lgr $t2,$a
ngr $t1,$c
$ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]`
xgr $h,$t0 # h=Sigma0(a)
lgr $t0,$a
ogr $t0,$b
ngr $t0,$c
lgr $t1,$a
ngr $t1,$b
ogr $t0,$t1 # Maj(a,b,c)
algr $h,$t0 # h+=Maj(a,b,c)
algr $d,$T1 # d+=T1
ngr $t2,$b
algr $h,$T1 # h+=T1
ogr $t2,$t1 # Maj(a,b,c)
la $d,0($d,$T1) # d+=T1
algr $h,$t2 # h+=Maj(a,b,c)
___
}
...
...
@@ -120,15 +126,15 @@ $code.=<<___;
$LD $t1,`160+$SZ*(($i+14)%16)`($sp)
$ROT $t0,$T1,$sigma0[0]
$SHR $T1,$sigma0[2]
$ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
xgr $T1,$t0
$ROT $t0,$t0,`$sigma0[1]-$sigma0[0]`
xgr $T1,$t0 # sigma0(X[i+1])
$ROT $t0,$t1,$sigma1[0]
$ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i]
xgr $T1,$t2 # sigma0(X[i+1])
$SHR $t1,$sigma1[2]
$ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i]
xgr $t1,$t0
$ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
$ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
$ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
xgr $t1,$t0 # sigma1(X[i+14])
algr $T1,$t1 # +=sigma1(X[i+14])
___
...
...
@@ -225,15 +231,14 @@ $code.=<<___ if ($kimdfunc);
___
$code
.=
<<___;
sllg $len,$len,`log(16*$SZ)/log(2)`
la $len,0($inp,$len)
stmg $len,%r15,32($sp)
lghi %r1,-$frame
agr $len,$inp
stmg $ctx,%r15,16($sp)
lgr %r0,$sp
aghi $sp,-$frame
la $sp,0(%r1,$sp)
stg %r0,0($sp)
bras $tbl,.Lpic
.Lpic: aghi $tbl,$Table-.Lpic
larl $tbl,$Table
$LD $A,`0*$SZ`($ctx)
$LD $B,`1*$SZ`($ctx)
$LD $C,`2*$SZ`($ctx)
...
...
@@ -255,6 +260,8 @@ $code.=<<___;
clgr $len,$t0
jne .Lrounds_16_xx
lg $ctx,`$frame+16`($sp)
la $inp,`16*$SZ`($inp)
$ADD $A,`0*$SZ`($ctx)
$ADD $B,`1*$SZ`($ctx)
$ADD $C,`2*$SZ`($ctx)
...
...
@@ -271,7 +278,6 @@ $code.=<<___;
$ST $F,`5*$SZ`($ctx)
$ST $G,`6*$SZ`($ctx)
$ST $H,`7*$SZ`($ctx)
la $inp,`16*$SZ`($inp)
clg $inp,`$frame+32`($sp)
jne .Lloop
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录