Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenHarmony
Third Party Openssl
提交
a5bb5bca
T
Third Party Openssl
项目概览
OpenHarmony
/
Third Party Openssl
大约 1 年 前同步成功
通知
9
Star
18
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
Third Party Openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
a5bb5bca
编写于
10月 03, 2013
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
bn/asm/x86_64-mont*.pl: add MULX/ADCX/ADOX code path.
上级
87954638
变更
2
展开全部
隐藏空白更改
内联
并排
Showing
2 changed files
with
1622 additions
and
38 deletions
+1622
-38
crypto/bn/asm/x86_64-mont.pl
crypto/bn/asm/x86_64-mont.pl
+1196
-38
crypto/bn/asm/x86_64-mont5.pl
crypto/bn/asm/x86_64-mont5.pl
+426
-0
未找到文件。
crypto/bn/asm/x86_64-mont.pl
浏览文件 @
a5bb5bca
此差异已折叠。
点击以展开。
crypto/bn/asm/x86_64-mont5.pl
浏览文件 @
a5bb5bca
...
...
@@ -31,6 +31,21 @@ die "can't locate x86_64-xlate.pl";
# Send everything printed to STDOUT through the translator script: the
# generated text is piped into `$^X $xlate $flavour $output`.
# The command must be a single line; a failure to start the pipe is fatal
# instead of being silently ignored.
open OUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;
# Decide whether the assembler in use can be trusted with the
# MULX/ADCX/ADOX code path; $addx gates every `if ($addx)` section of this
# script and stays false unless one of the probes below succeeds.
#
# Version numbers are compared component-wise as integers. Treating
# "major.minor" as a decimal fraction would rank 2.9 above 2.22.

# GNU assembler, queried through the compiler driver named by $ENV{CC}.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9])\.([0-9]+)/) {
	$addx = ($1>2 || ($1==2 && $2>=22));
}

# NASM, only consulted for Win64 builds when nothing above qualified.
if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9])\.([0-9]+)/) {
	$addx = ($1>2 || ($1==2 && $2>=10));
}

# Microsoft ml64, likewise Win64-only; only the major version is checked.
if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=11);
}
# int bn_mul_mont_gather5(
$rp
=
"
%rdi
";
# BN_ULONG *rp,
$ap
=
"
%rsi
";
# const BN_ULONG *ap,
...
...
@@ -53,6 +68,8 @@ $m1="%rbp";
$code
=
<<___;
.text
.extern OPENSSL_ia32cap_P
.globl bn_mul_mont_gather5
.type bn_mul_mont_gather5,\@function,6
.align 64
...
...
@@ -61,6 +78,11 @@ bn_mul_mont_gather5:
jnz .Lmul_enter
cmp \$8,${num}d
jb .Lmul_enter
___
# Only when the MULX/ADCX/ADOX path is being generated: load the 32-bit word
# at OPENSSL_ia32cap_P+8 into %r11d before jumping to .Lmul4x_enter, where it
# is tested.
# The heredoc must start on the `<<___` line and hold the instruction on a
# single line, otherwise the emitted assembly is garbage.
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code
.=
<<___;
jmp .Lmul4x_enter
.align 16
...
...
@@ -347,6 +369,13 @@ $code.=<<___;
.align 16
bn_mul4x_mont_gather5:
.Lmul4x_enter:
___
# Only when the MULX/ADCX/ADOX path is being generated: branch to
# .Lmulx4x_enter if both bits of mask 0x80100 are set in %r11d (loaded from
# OPENSSL_ia32cap_P+8 before the jump to .Lmul4x_enter).
# NOTE(review): 0x80100 is presumably the BMI2 and ADX feature bits - confirm
# against the OPENSSL_ia32cap documentation.
$code.=<<___ if ($addx);
and \$0x80100,%r11d
cmp \$0x80100,%r11d
je .Lmulx4x_enter
___
$code
.=
<<___;
mov ${num}d,${num}d
mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
push %rbx
...
...
@@ -828,7 +857,404 @@ $code.=<<___;
.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
___
}}}
if ($addx) {{{
######################################################################
# MULX/ADCX/ADOX code path. Appends bn_mulx4x_mont_gather5 to $code; the
# routine is entered through the .Lmulx4x_enter label.
#
# Emitted code, in order:
#   - prologue: saves %rbx, %rbp, %r12-%r15 (plus %xmm6/%xmm7 on Win64)
#     and carves out an aligned frame (see "Stack layout" in the heredoc);
#   - .Lmulx4x_body: selects b[0] with the mask set from .Lmagic_masks that
#     is picked by the 7th argument, then runs the first pass
#     (.Lmulx4x_1st);
#   - .Lmulx4x_outer/.Lmulx4x_inner: one pass per remaining b[i];
#   - .Lmulx4x_sub: adds the complemented n[] words, masked by the top-most
#     carry, to tmp[], stores the result through rp and zeroes tmp[]
#     (this looks like the final conditional subtraction of the modulus);
#   - epilogue: restores the saved registers and returns 1 in %rax.
#
# Heredocs are Perl string literals: every `#` inside them is an assembler
# comment that ends up in the generated output.
######################################################################

# %rdx holds bp on entry, but only until it is reloaded with bp[0] for mulx.
my $bp="%rdx";	# original value

$code.=<<___;
.type bn_mulx4x_mont_gather5,\@function,6
.align 32
bn_mulx4x_mont_gather5:
.Lmulx4x_enter:
mov %rsp,%rax
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
___
# Win64 only: %xmm6 and %xmm7 are used as masks below, so save them first.
$code.=<<___ if ($win64);
lea -0x28(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
___
# $num, $n0 and $rp in this heredoc are still the enclosing scope's values;
# $num is re-bound to %rax only by the `my` list further down.
$code.=<<___;
shl \$3,${num}d # convert $num to bytes
xor %r10,%r10
mov %rsp,%r11 # put aside %rsp
sub $num,%r10 # -$num
mov ($n0),$n0 # *n0
lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8)
and \$-128,%rsp
##############################################################
# Stack layout
# +0 num
# +8 off-loaded &b[i]
# +16 end of b[num]
# +24 saved n0
# +32 saved rp
# +40
# +48 inner counter
# +56 saved %rsp
# +64 tmp[num+1]
#
mov $num,0(%rsp) # save $num
shl \$5,$num
lea 256($bp,$num),%r10
shr \$5+5,$num
mov %r10,16(%rsp) # end of b[num]
sub \$1,$num
mov $n0, 24(%rsp) # save *n0
mov $rp, 32(%rsp) # save $rp
mov $num,48(%rsp) # inner counter
mov %r11,56(%rsp) # save original %rsp
jmp .Lmulx4x_body
.align 32
.Lmulx4x_body:
___
# Register aliases for the body. From here on $num names %rax; $bptr and
# $rptr share %rdi, which is also borrowed as a scratch/counter register.
my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
my $STRIDE=2**5*8;	# 5 is "window size"
my $N=$STRIDE/4;	# should match cache line size

# Backtick expressions inside the heredoc are left as literal text here;
# NOTE(review): presumably evaluated later by the script's output stage -
# that code is not part of this section.
$code.=<<___;
mov `($win64?56:8)`(%rax),%r10d # load 7th argument
mov %r10,%r11
shr \$`log($N/8)/log(2)`,%r10
and \$`$N/8-1`,%r11
not %r10
lea .Lmagic_masks(%rip),%rax
and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
lea 96($bp,%r11,8),$bptr # pointer within 1st cache line
movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
movq 8(%rax,%r10,8),%xmm5 # cache line contains element
movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
movq 24(%rax,%r10,8),%xmm7
movq `0*$STRIDE/4-96`($bptr),%xmm0
movq `1*$STRIDE/4-96`($bptr),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bptr),%xmm2
pand %xmm5,%xmm1
movq `3*$STRIDE/4-96`($bptr),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
por %xmm2,%xmm0
lea $STRIDE($bptr),$bptr
por %xmm3,%xmm0
movq %xmm0,%rdx # bp[0]
movq `0*$STRIDE/4-96`($bptr),%xmm0
movq `1*$STRIDE/4-96`($bptr),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bptr),%xmm2
pand %xmm5,%xmm1
lea 64+32(%rsp),$tptr
mov %rdx,$bi
xor $zero,$zero # of=0,cf=0
mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
mulx 1*8($aptr),%r11,%r14 # a[1]*b[0]
adcx %rax,%r11
mulx 2*8($aptr),%r12,%r13 # ...
adcx %r14,%r12
adcx $zero,%r13
movq `3*$STRIDE/4-96`($bptr),%xmm3
lea $STRIDE($bptr),%r10 # next &b[i]
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
mov $mi,$bptr # borrow $bptr
imulq 24(%rsp),$mi # "t[0]"*n0
xor $zero,$zero # cf=0, of=0
por %xmm2,%xmm0
por %xmm3,%xmm0
mov %r10,8(%rsp) # off-load &b[i]
mulx 3*8($aptr),%rax,%r14
mov $mi,%rdx
lea 4*8($aptr),$aptr
adcx %rax,%r13
adcx $zero,%r14 # cf=0
mulx 0*8($nptr),%rax,%r10
adcx %rax,$bptr # discarded
adox %r11,%r10
mulx 1*8($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
mulx 2*8($nptr),%rax,%r12
mov 48(%rsp),$bptr # counter value
mov %r10,-4*8($tptr)
adcx %rax,%r11
adox %r13,%r12
mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
mov %r11,-3*8($tptr)
adcx %rax,%r12
adox $zero,%r15 # of=0
lea 4*8($nptr),$nptr
mov %r12,-2*8($tptr)
jmp .Lmulx4x_1st
.align 32
.Lmulx4x_1st:
adcx $zero,%r15 # cf=0, modulo-scheduled
mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
adcx %r14,%r10
mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
adcx %rax,%r11
mulx 2*8($aptr),%r12,%rax # ...
adcx %r14,%r12
mulx 3*8($aptr),%r13,%r14
.byte 0x66,0x66
mov $mi,%rdx
adcx %rax,%r13
adcx $zero,%r14 # cf=0
lea 4*8($aptr),$aptr
lea 4*8($tptr),$tptr
adox %r15,%r10
mulx 0*8($nptr),%rax,%r15
adcx %rax,%r10
adox %r15,%r11
mulx 1*8($nptr),%rax,%r15
adcx %rax,%r11
adox %r15,%r12
.byte 0x3e
mulx 2*8($nptr),%rax,%r15
mov %r10,-5*8($tptr)
mov %r11,-4*8($tptr)
adcx %rax,%r12
adox %r15,%r13
mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
mov %r12,-3*8($tptr)
adcx %rax,%r13
adox $zero,%r15
lea 4*8($nptr),$nptr
mov %r13,-2*8($tptr)
dec $bptr # of=0, pass cf
jnz .Lmulx4x_1st
mov 0(%rsp),$num # load num
mov 8(%rsp),$bptr # re-load &b[i]
movq %xmm0,%rdx # bp[1]
adc $zero,%r15 # modulo-scheduled
add %r15,%r14
sbb %r15,%r15 # top-most carry
mov %r14,-1*8($tptr)
jmp .Lmulx4x_outer
.align 32
.Lmulx4x_outer:
sub $num,$aptr # rewind $aptr
mov %r15,($tptr) # save top-most carry
mov 64(%rsp),%r10
lea 64(%rsp),$tptr
sub $num,$nptr # rewind $nptr
xor $zero,$zero # cf=0, of=0
mov %rdx,$bi
movq `0*$STRIDE/4-96`($bptr),%xmm0
movq `1*$STRIDE/4-96`($bptr),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bptr),%xmm2
pand %xmm5,%xmm1
mulx 0*8($aptr),$mi,%rax # a[0]*b[i]
adox %r10,$mi
mov 1*8($tptr),%r10
mulx 1*8($aptr),%r11,%r14 # a[1]*b[i]
adcx %rax,%r11
mulx 2*8($aptr),%r12,%r13 # ...
adox %r10,%r11
adcx %r14,%r12
adox $zero,%r12
adcx $zero,%r13
movq `3*$STRIDE/4-96`($bptr),%xmm3
lea $STRIDE($bptr),%r10 # next &b[i]
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
mov $mi,$bptr # borrow $bptr
imulq 24(%rsp),$mi # "t[0]"*n0
xor $zero,$zero # cf=0, of=0
por %xmm2,%xmm0
por %xmm3,%xmm0
mov %r10,8(%rsp) # off-load &b[i]
mov 2*8($tptr),%r10
mulx 3*8($aptr),%rax,%r14
mov $mi,%rdx
adox %r10,%r12
adcx %rax,%r13
adox 3*8($tptr),%r13
adcx $zero,%r14
lea 4*8($aptr),$aptr
lea 4*8($tptr),$tptr
adox $zero,%r14
mulx 0*8($nptr),%rax,%r10
adcx %rax,$bptr # discarded
adox %r11,%r10
mulx 1*8($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
mulx 2*8($nptr),%rax,%r12
.byte 0x3e
mov %r10,-4*8($tptr)
.byte 0x3e
mov 0*8($tptr),%r10
adcx %rax,%r11
adox %r13,%r12
mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
mov %r11,-3*8($tptr)
adcx %rax,%r12
adox $zero,%r15 # of=0
mov 48(%rsp),$bptr # counter value
mov %r12,-2*8($tptr)
lea 4*8($nptr),$nptr
jmp .Lmulx4x_inner
.align 32
.Lmulx4x_inner:
adcx $zero,%r15 # cf=0, modulo-scheduled
adox %r10,%r14
mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
mov 1*8($tptr),%r13
adcx %r14,%r10
mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
adox %rax,%r11
mulx 2*8($aptr),%r12,%rax # ...
adcx %r13,%r11
adox %r14,%r12
mulx 3*8($aptr),%r13,%r14
mov $mi,%rdx
adcx 2*8($tptr),%r12
adox %rax,%r13
adcx 3*8($tptr),%r13
adox $zero,%r14 # of=0
lea 4*8($aptr),$aptr
.byte 0x48,0x8d,0x9b,0x20,0x00,0x00,0x00 # lea 4*8($tptr),$tptr
adcx $zero,%r14 # cf=0
adox %r15,%r10
.byte 0x3e,0xc4,0x62,0xfb,0xf6,0x79,0x00 # mulx 0*8($nptr),%rax,%r15
adcx %rax,%r10
adox %r15,%r11
mulx 1*8($nptr),%rax,%r15
adcx %rax,%r11
adox %r15,%r12
mulx 2*8($nptr),%rax,%r15
mov %r10,-5*8($tptr)
mov 0*8($tptr),%r10
adcx %rax,%r12
adox %r15,%r13
mulx 3*8($nptr),%rax,%r15
mov $bi,%rdx
mov %r11,-4*8($tptr)
mov %r12,-3*8($tptr)
adcx %rax,%r13
adox $zero,%r15
lea 4*8($nptr),$nptr
mov %r13,-2*8($tptr)
dec $bptr # of=0, pass cf
jnz .Lmulx4x_inner
mov 0(%rsp),$num # load num
mov 8(%rsp),$bptr # re-load &b[i]
movq %xmm0,%rdx # bp[i+1]
adc $zero,%r15 # modulo-scheduled
sub %r10,$zero # pull top-most carry
adc %r15,%r14
sbb %r15,%r15 # top-most carry
mov %r14,-1*8($tptr)
cmp 16(%rsp),$bptr
jb .Lmulx4x_outer
neg $num
mov 32(%rsp),$rptr # restore rp
lea 64(%rsp),$tptr
xor %rdx,%rdx
pxor %xmm0,%xmm0
mov 0*8($nptr,$num),%r8
mov 1*8($nptr,$num),%r9
neg %r8
jmp .Lmulx4x_sub_entry
.align 32
.Lmulx4x_sub:
mov 0*8($nptr,$num),%r8
mov 1*8($nptr,$num),%r9
not %r8
.Lmulx4x_sub_entry:
mov 2*8($nptr,$num),%r10
not %r9
and %r15,%r8
mov 3*8($nptr,$num),%r11
not %r10
and %r15,%r9
not %r11
and %r15,%r10
and %r15,%r11
neg %rdx # mov %rdx,%cf
adc 0*8($tptr),%r8
adc 1*8($tptr),%r9
movdqa %xmm0,($tptr)
adc 2*8($tptr),%r10
adc 3*8($tptr),%r11
movdqa %xmm0,16($tptr)
lea 4*8($tptr),$tptr
sbb %rdx,%rdx # mov %cf,%rdx
mov %r8,0*8($rptr)
mov %r9,1*8($rptr)
mov %r10,2*8($rptr)
mov %r11,3*8($rptr)
lea 4*8($rptr),$rptr
add \$32,$num
jnz .Lmulx4x_sub
mov 56(%rsp),%rsi # restore %rsp
mov \$1,%rax
___
# Win64 only: undo the prologue's %xmm6/%xmm7 save and step %rsi past it.
$code.=<<___ if ($win64);
movaps (%rsi),%xmm6
movaps 0x10(%rsi),%xmm7
lea 0x28(%rsi),%rsi
___
# Pop the six saved general-purpose registers through %rsi and return.
$code.=<<___;
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
.Lmulx4x_epilogue:
ret
.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
___
}}}
{
my
(
$inp
,
$num
,
$tbl
,
$idx
)
=
$win64
?("
%rcx
","
%rdx
","
%r8
",
"
%r9
")
:
# Win64 order
("
%rdi
","
%rsi
","
%rdx
","
%rcx
");
# Unix order
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录