Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenHarmony
Third Party Openssl
提交
68ea6068
T
Third Party Openssl
项目概览
OpenHarmony
/
Third Party Openssl
1 年多 前同步成功
通知
10
Star
18
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
Third Party Openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
68ea6068
编写于
12月 15, 2005
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add IALU-only bn_mul_mont for SPARCv9. See commentary section for details.
上级
6df8c74d
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
623 addition
and
0 deletion
+623
-0
crypto/bn/asm/sparcv9-mont.pl
crypto/bn/asm/sparcv9-mont.pl
+623
-0
未找到文件。
crypto/bn/asm/sparcv9-mont.pl
0 → 100644
浏览文件 @
68ea6068
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for undertaken effort are multiple. First of all, UltraSPARC is not
# the whole SPARCv9 universe and other VIS-free implementations deserve
# optimized code as much. Secondly, newly introduced UltraSPARC T1,
# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
# several integrated RSA/DSA accelerator circuits accessible through
# kernel driver [only(*)], but having decent user-land software
# implementation is important too. Finally, reasons like desire to
# experiment with dedicated squaring procedure. Yes, this module
# implements one, because it was easiest to draft it in SPARCv9
# instructions...
# (*) Engine accessing the driver in question is on my TODO list.
# For reference, accelerator is estimated to give 6 to 10 times
# improvement on single-threaded RSA sign. It should be noted
# that 6-10x improvement coefficient does not actually mean
# something extraordinary in terms of absolute [single-threaded]
# performance, as SPARCv9 instruction set is by all means least
# suitable for high performance crypto among other 64 bit
# platforms. 6-10x factor simply places T1 in same performance
# domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
# appear impressive at all, but it's the sign operation which is
# far more critical/interesting.
# You might notice that inner loops are modulo-scheduled:-) This has
# essentially negligible impact on UltraSPARC performance, it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
# module still has hidden potential [see TODO list there], which is
# estimated to be larger than 20%...
# Symbolic names for the SPARC registers used by the code templates below.
# Each value is interpolated verbatim into an instruction operand, so it
# must be a bare register name with no surrounding whitespace.

# int bn_mul_mont(
$rp="%i0";	# BN_ULONG *rp,
$ap="%i1";	# const BN_ULONG *ap,
$bp="%i2";	# const BN_ULONG *bp,
$np="%i3";	# const BN_ULONG *np,
$n0="%i4";	# const BN_ULONG *n0,
$num="%i5";	# int num);

# Target selection: 32-bit unless a 64-bit compiler flag (-m64 or
# -xarch=v9) is passed on the command line.
$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }

# $bias is added to %sp to obtain the real top of stack; $frame is the size
# handed to the "save" instruction and the offset of the tp[] scratch area.
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=128; }

$car0="%o0";
$car1="%o1";
$car2="%o2";	# 1 bit
$acc0="%o3";
$acc1="%o4";
$mask="%g1";	# 32 bits, what a waste...
$tmp0="%g4";
$tmp1="%g5";

$i="%l0";
$j="%l1";
$mul0="%l2";
$mul1="%l3";
$tp="%l4";
$apj="%l5";
$npj="%l6";
$tpj="%l7";

# Name of the emitted global symbol.
$fname="bn_mul_mont";
# Template for the entry point and the general (ap != bp) multiplication
# path of $fname, plus the tail shared with the squaring path.
#
#   entry     - if num (%o5) is below 4, return immediately with %o0
#               cleared; otherwise continue at .Lenter.
#   .Lenter   - opens a register window, turns num into a byte count
#               (num*4), finishes building the 0xffffffff $mask, loads
#               the word at n0 and bp[0], and branches to .Lbn_sqr_mont
#               when the ap and bp pointers compare equal.  The
#               backquoted Perl expression picks %icc or %xcc for that
#               comparison according to $bits.
#   alloca    - a scratch vector tp[] is carved out below the current
#               stack: the new top is (real top - num) rounded down by
#               "and -1024", and tp starts at %sp+$bias+$frame.
#   .L1st     - first pass: sums of ap[j]*bp[0] and np[j]*$mul1 are
#               written to tp[], where $mul1 is the low 32 bits of
#               "t[0]"*n0.
#   .Louter / .Linner
#             - later passes for bp[i]: same products, additionally
#               accumulating the previous tp[j]; $car2 carries the top
#               overflow bit from one pass into the next.
#   .Ltail    - with $car2 set, or when the top tp word is not below the
#               top np word, .Lsub stores tp[]-np[] to rp[]; if that
#               subtraction borrows, or the top tp word was lower,
#               .Lcopy stores tp[] to rp[] instead.
#   .Lzap     - overwrites tp[] with zeros, sets %i0 to 1 and returns.
#
# Lines tagged !prologue!/!epilogue! are the fill and drain stages of the
# scheduled loops, so instruction order in this template is significant.
$code
=
<<___;
.section ".text",#alloc,#execinstr
.global $fname
.align 32
$fname:
cmp %o5,4 ! 128 bits minimum
bge,pt %icc,.Lenter
sethi %hi(0xffffffff),$mask
retl
clr %o0
.align 32
.Lenter:
save %sp,-$frame,%sp
sll $num,2,$num ! num*=4
or $mask,%lo(0xffffffff),$mask
ld [$n0],$n0
cmp $ap,$bp
and $num,$mask,$num
ld [$bp],$mul0 ! bp[0]
be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
nop
add %sp,$bias,%o7 ! real top of stack
ld [$ap],$car0 ! ap[0]
sub %o7,$num,%o7
ld [$ap+4],$apj ! ap[1]
and %o7,-1024,%o7
ld [$np],$car1 ! np[0]
sub %o7,$bias,%sp ! alloca
ld [$np+4],$npj ! np[1]
mov 12,$j
mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
and $car0,$mask,$acc0
add %sp,$bias+$frame,$tp
ld [$ap+8],$apj !prologue!
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
srlx $car0,32,$car0
add $acc0,$car1,$car1
ld [$np+8],$npj !prologue!
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.L1st:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
add $acc1,$car1,$car1
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$car1,$car1
add $j,4,$j ! j++
mov $tmp0,$acc0
st $car1,[$tp]
cmp $j,$num
mov $tmp1,$acc1
srlx $car1,32,$car1
bl %icc,.L1st
add $tp,4,$tp ! tp++
!.L1st
mulx $apj,$mul0,$tmp0 !epilogue!
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0
and $car0,$mask,$acc0
add $acc1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $tmp0,$car0,$car0
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car1
add $car0,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
mov 4,$i ! i++
ld [$bp+4],$mul0 ! bp[1]
.Louter:
add %sp,$bias+$frame,$tp
ld [$ap],$car0 ! ap[0]
ld [$ap+4],$apj ! ap[1]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
ld [$tp],$tmp1 ! tp[0]
ld [$tp+4],$tpj ! tp[1]
mov 12,$j
mulx $car0,$mul0,$car0
mulx $apj,$mul0,$tmp0 !prologue!
add $tmp1,$car0,$car0
ld [$ap+8],$apj !prologue!
and $car0,$mask,$acc0
mulx $n0,$acc0,$mul1
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1
mulx $npj,$mul1,$acc1 !prologue!
srlx $car0,32,$car0
add $acc0,$car1,$car1
ld [$np+8],$npj !prologue!
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.Linner:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $tpj,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
add $acc0,$car0,$car0
add $acc1,$car1,$car1
ld [$np+$j],$npj ! np[j]
and $car0,$mask,$acc0
ld [$tp+8],$tpj ! tp[j]
srlx $car0,32,$car0
add $acc0,$car1,$car1
add $j,4,$j ! j++
mov $tmp0,$acc0
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
mov $tmp1,$acc1
cmp $j,$num
bl %icc,.Linner
add $tp,4,$tp ! tp++
!.Linner
mulx $apj,$mul0,$tmp0 !epilogue!
mulx $npj,$mul1,$tmp1
add $tpj,$car0,$car0
add $acc0,$car0,$car0
ld [$tp+8],$tpj ! tp[j]
and $car0,$mask,$acc0
add $acc1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $tpj,$car0,$car0
add $tmp0,$car0,$car0
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
add $acc0,$car1,$car1
st $car1,[$tp+4] ! tp[j-1]
srlx $car0,32,$car0
add $i,4,$i ! i++
srlx $car1,32,$car1
add $car0,$car1,$car1
cmp $i,$num
add $car2,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
bl,a %icc,.Louter
ld [$bp+$i],$mul0 ! bp[i]
!.Louter
add $tp,12,$tp
.Ltail:
add $np,$num,$np
add $rp,$num,$rp
cmp $car2,0 ! clears %icc.c
bne,pn %icc,.Lsub
sub %g0,$num,%o7 ! k=-num
cmp $car1,$npj ! compare top-most $tp and $np words
bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
nop
.align 16,0x1000000
.Lsub:
ld [$tp+%o7],%o0
ld [$np+%o7],%o1
subccc %o0,%o1,%o1
st %o1,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lsub
nop
subccc $car2,0,$car2
bcc %icc,.Lzap
sub %g0,$num,%o7
.align 16,0x1000000
.Lcopy:
ld [$tp+%o7],%o0
st %o0,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lcopy
nop
ba .Lzap
sub %g0,$num,%o7
.align 32
.Lzap:
st %g0,[$tp+%o7]
add %o7,4,%o7
brnz %o7,.Lzap
nop
mov 1,%i0
ret
restore
___
########
######## bn_sqr_mont gives up to 20% improvement over above code
########
# Register holding the single bit carried from word to word while the
# squaring code doubles its cross products.  It must be a bare register
# name because it is interpolated into instruction operands.
$sbit="%i2";		# re-use $bp!
# Template for the dedicated squaring path, appended to $code.  It is
# reached through the .Lbn_sqr_mont label and leaves through "ba .Ltail".
#
# Cross products ap[j]*$mul0 are summed in $car0, and each 32-bit word of
# that sum is doubled (add $acc0,$acc0,$acc0) before being folded into
# tp[]; $sbit holds the bit pushed out of one doubled word and is OR-ed
# into the next one.  The np[j]*$mul1 terms are added undoubled.
#
#   .Lbn_sqr_mont - allocates tp[] below the stack (new top rounded down
#                   by "and -1024") and starts the first pass, squaring
#                   the word already held in $mul0.
#   .Lsqr_1st     - first pass, writing tp[].
#   .Lsqr_2nd     - second pass with $mul0 = ap[1], accumulating into
#                   tp[].
#   .Lsqr_outer   - later passes with $mul0 = ap[i], starting at $i = 8
#                   (byte offset) and repeated while $i+4 < $num:
#       .Lsqr_inner1 - while $j < $i only np[j]*$mul1 is added to tp[].
#       .Lsqr_inner2 - remaining words also receive the doubled
#                      ap[j]*$mul0 products; skipped via
#                      .Lsqr_no_inner2 when $j has already reached $num.
#   .Lsqr_last    - final pass; stores the top words, leaves the overflow
#                   bit in $car2 and jumps to .Ltail.
#
# The closing .type/.size directives describe the whole $fname symbol.
# Instruction order inside the loops is significant; do not reorder.
$code
.=
<<___;
.align 32
.Lbn_sqr_mont:
add %sp,$bias,%o7 ! real top of stack
ld [$ap+4],$apj ! ap[1]
sub %o7,$num,%o7
ld [$np],$car1 ! np[0]
and %o7,-1024,%o7
ld [$np+4],$npj ! np[1]
sub %o7,$bias,%sp ! alloca
mov 12,$j
mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
mulx $apj,$mul0,$tmp0 !prologue!
and $car0,$mask,$acc0
add %sp,$bias+$frame,$tp
ld [$ap+8],$apj !prologue!
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
srlx $car0,32,$car0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
mulx $npj,$mul1,$acc1 !prologue!
and $car0,1,$sbit
ld [$np+8],$npj !prologue!
srlx $car0,1,$car0
add $acc0,$car1,$car1
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.Lsqr_1st:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0 ! ap[j]*a0+c0
add $acc1,$car1,$car1
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
mov $tmp1,$acc1
srlx $acc0,32,$sbit
add $j,4,$j ! j++
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
st $car1,[$tp]
mov $tmp0,$acc0
srlx $car1,32,$car1
bl %icc,.Lsqr_1st
add $tp,4,$tp ! tp++
!.Lsqr_1st
mulx $apj,$mul0,$tmp0 ! epilogue
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0 ! ap[j]*a0+c0
add $acc1,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $tmp0,$car0,$car0 ! ap[j]*a0+c0
add $tmp1,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
ld [%sp+$bias+$frame],$tmp0 ! tp[0]
ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
ld [%sp+$bias+$frame+8],$tpj ! tp[2]
ld [$ap+4],$mul0 ! ap[1]
ld [$ap+8],$apj ! ap[2]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp0,$mul1
mulx $mul0,$mul0,$car0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1
mulx $npj,$mul1,$acc1
add $tmp0,$car1,$car1
and $car0,$mask,$acc0
ld [$np+8],$npj ! np[2]
srlx $car1,32,$car1
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
and $car0,1,$sbit
add $acc1,$car1,$car1
srlx $car0,1,$car0
mov 12,$j
st $car1,[%sp+$bias+$frame] ! tp[0]=
srlx $car1,32,$car1
add %sp,$bias+$frame+4,$tp
.Lsqr_2nd:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
add $tpj,$car1,$car1
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc1,$car1,$car1
ld [$tp+8],$tpj ! tp[j]
add $acc0,$acc0,$acc0
add $j,4,$j ! j++
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
bl %icc,.Lsqr_2nd
add $tp,4,$tp ! tp++
!.Lsqr_2nd
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
add $tpj,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc1,$car1,$car1
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
ld [$ap+8],$mul0 ! ap[2]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp1,$mul1
and $mul1,$mask,$mul1
mov 8,$i
mulx $mul0,$mul0,$car0
mulx $car1,$mul1,$car1
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add %sp,$bias+$frame,$tp
srlx $car1,32,$car1
and $car0,1,$sbit
srlx $car0,1,$car0
mov 4,$j
.Lsqr_outer:
.Lsqr_inner1:
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $j,4,$j
ld [$tp+8],$tpj
cmp $j,$i
add $acc1,$car1,$car1
ld [$np+$j],$npj
st $car1,[$tp]
srlx $car1,32,$car1
bl %icc,.Lsqr_inner1
add $tp,4,$tp
!.Lsqr_inner1
add $j,4,$j
ld [$ap+$j],$apj ! ap[j]
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
ld [$np+$j],$npj ! np[j]
add $acc0,$car1,$car1
ld [$tp+8],$tpj ! tp[j]
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $j,4,$j
cmp $j,$num
be,pn %icc,.Lsqr_no_inner2
add $tp,4,$tp
.Lsqr_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
ld [$tp+8],$tpj ! tp[j]
or $sbit,$acc0,$acc0
add $j,4,$j ! j++
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
bl %icc,.Lsqr_inner2
add $tp,4,$tp ! tp++
.Lsqr_no_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $acc0,$car0,$car0
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
add $i,4,$i ! i++
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
ld [$ap+$i],$mul0 ! ap[j]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp1,$mul1
and $mul1,$mask,$mul1
add $i,4,$tmp0
mulx $mul0,$mul0,$car0
mulx $car1,$mul1,$car1
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add %sp,$bias+$frame,$tp
srlx $car1,32,$car1
and $car0,1,$sbit
srlx $car0,1,$car0
cmp $tmp0,$num ! i<num-1
bl %icc,.Lsqr_outer
mov 4,$j
.Lsqr_last:
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $j,4,$j
ld [$tp+8],$tpj
cmp $j,$i
add $acc1,$car1,$car1
ld [$np+$j],$npj
st $car1,[$tp]
srlx $car1,32,$car1
bl %icc,.Lsqr_last
add $tp,4,$tp
!.Lsqr_last
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $car0,$car0,$car0 ! recover $car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
ba .Ltail
add $tp,8,$tp
.type $fname,#function
.size $fname,(.-$fname)
___
# Expand every `...` span in the template by evaluating its contents as
# Perl (/e), across the whole multi-line string (/g, /m); this is what
# turns `$bits==32?"%icc":"%xcc"` into the right condition-code register.
# The modifiers must stay attached to the substitution: split apart they
# are a syntax error and the replacement would no longer be evaluated.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Write the generated assembly to standard output and report a failed
# flush (e.g. a full disk) instead of silently leaving a truncated file.
print $code;
close STDOUT or die "error closing STDOUT: $!";
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录