Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenHarmony
Third Party Openssl
提交
7d9cf7c0
T
Third Party Openssl
项目概览
OpenHarmony
/
Third Party Openssl
1 年多 前同步成功
通知
10
Star
18
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
T
Third Party Openssl
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
7d9cf7c0
编写于
17年前
作者:
A
Andy Polyakov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Eliminate conditional final subtraction in Montgomery assembler modules.
上级
55525742
变更
10
隐藏空白更改
内联
并排
Showing
10 changed file
with
274 addition
and
273 deletion
+274
-273
crypto/bn/asm/alpha-mont.pl
crypto/bn/asm/alpha-mont.pl
+23
-31
crypto/bn/asm/armv4-mont.pl
crypto/bn/asm/armv4-mont.pl
+21
-23
crypto/bn/asm/mips3-mont.pl
crypto/bn/asm/mips3-mont.pl
+34
-39
crypto/bn/asm/ppc-mont.pl
crypto/bn/asm/ppc-mont.pl
+26
-25
crypto/bn/asm/s390x-mont.pl
crypto/bn/asm/s390x-mont.pl
+25
-25
crypto/bn/asm/sparcv9-mont.pl
crypto/bn/asm/sparcv9-mont.pl
+15
-21
crypto/bn/asm/sparcv9a-mont.pl
crypto/bn/asm/sparcv9a-mont.pl
+19
-17
crypto/bn/asm/via-mont.pl
crypto/bn/asm/via-mont.pl
+57
-37
crypto/bn/asm/x86-mont.pl
crypto/bn/asm/x86-mont.pl
+26
-28
crypto/bn/asm/x86_64-mont.pl
crypto/bn/asm/x86_64-mont.pl
+28
-27
未找到文件。
crypto/bn/asm/alpha-mont.pl
浏览文件 @
7d9cf7c0
...
...
@@ -258,56 +258,48 @@ bn_mul_mont:
stq $hi1,16($tp)
bne $tj,.Louter
s8addq $num,sp,$
ap
mov $rp,$bp
s8addq $num,sp,$
tj # &tp[num]
mov $rp,$bp
# put rp aside
mov sp,$tp
mov 0,$hi0
bne $hi1,.Lsub
cmpult $nj,$lo1,AT
bne AT,.Lsub
.align 4
.Lcopy: ldq AT,($tp)
lda $tp,8($tp)
stq AT,($rp)
cmpult $tp,$ap,AT
stq zero,-8($tp)
nop
lda $rp,8($rp)
bne AT,.Lcopy
mov 1,v0
br .Lexit
mov sp,$ap
srl $nj,62,AT # boundary condition...
beq AT,.Lcopy # ... is met
mov 0,$hi0 # clear borrow bit
.align 4
.Lsub: ldq $lo0,($tp)
ldq $lo1,($np)
subq $lo0,$lo1,$lo1
lda $tp,8($tp)
lda $np,8($np)
subq $lo0,$lo1,$lo1 # tp[i]-np[i]
cmpult $lo0,$lo1,AT
subq $lo1,$hi0,$lo0
cmpult $lo1,$lo0,$hi0
lda $tp,8($tp)
or $hi0,AT,$hi0
lda $np,8($np)
stq $lo0,($rp)
cmpult $tp,$
ap
,v0
cmpult $tp,$
tj
,v0
lda $rp,8($rp)
bne v0,.Lsub
subq $hi1,$hi0,$hi0
subq $hi1,$hi0,$hi0
# handle upmost overflow bit
mov sp,$tp
cmpule $hi1,$hi0,AT
mov $bp,$rp
bne AT,.Lcopy
mov $bp,$rp # restore rp
and sp,$hi0,$ap
bic $bp,$hi0,$bp
bis $bp,$ap,$ap # ap=borrow?tp:rp
.align 4
.Lzap: stq zero,($tp)
cmpult $tp,$ap,AT
.Lcopy: ldq $aj,($ap) # copy or in-place refresh
lda $tp,8($tp)
bne AT,.Lzap
lda $rp,8($rp)
lda $ap,8($ap)
stq zero,-8($tp) # zap tp
cmpult $tp,$tj,AT
stq $aj,-8($rp)
bne AT,.Lcopy
mov 1,v0
.align 4
.Lexit:
.set noreorder
mov fp,sp
...
...
This diff is collapsed.
Click to expand it.
crypto/bn/asm/armv4-mont.pl
浏览文件 @
7d9cf7c0
...
...
@@ -61,7 +61,7 @@ bn_mul_mont:
cmp $num,#2
movlt r0,#0
addlt sp,sp,#2*4
blt .Lab
o
rt
blt .Labrt
stmdb sp!,{r4-r12,lr} @ save 10 registers
...
...
@@ -160,27 +160,13 @@ bn_mul_mont:
add $num,$num,#4 @ $num to point at &tp[num]
sub $aj,$num,sp @ "original" num value
mov $tp,sp @ "rewind" $tp
mov $ap,$tp @ "borrow" $ap
sub $np,$np,$aj @ "rewind" $np to &np[0]
cmp $nhi,#0 @ upmost carry
bne .Lsub
cmp $nlo,$nj @ tp[num-1]-np[num-1]
bhs .Lsub
.Lcopy: ldr $tj,[$tp]
str sp,[$tp],#4 @ zap tp
str $tj,[$rp],#4
cmp $tp,$num
bne .Lcopy
.Lexit: add sp,$num,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labort:tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
movs $tj,$nj,lsr#30 @ boundary condition...
beq .Lcopy @ ... is met
subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
ldr $nj,[$np],#4
sbcs $tj,$tj,$nj @ tp[j]-np[j]
...
...
@@ -190,12 +176,24 @@ bn_mul_mont:
sbcs $nhi,$nhi,#0 @ upmost carry
mov $tp,sp @ "rewind" $tp
sub $rp,$rp,$aj @ "rewind" $rp
blo .Lcopy @ tp was less after all
.Lzap: str sp,[$tp],#4
and $ap,$tp,$nhi
bic $np,$rp,$nhi
orr $ap,$ap,$np @ ap=borrow?tp:rp
.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
str sp,[$tp],#4 @ zap tp
str $tj,[$rp],#4
cmp $tp,$num
bne .Lzap
bal .Lexit
bne .Lcopy
add sp,$num,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt: tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
.size bn_mul_mont,.-bn_mul_mont
.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
___
...
...
This diff is collapsed.
Click to expand it.
crypto/bn/asm/mips3-mont.pl
浏览文件 @
7d9cf7c0
...
...
@@ -265,27 +265,50 @@ bn_mul_mont:
addu $i,8
sltu s7,$i,$num
bnez s7,.Louter
.set noreorder
PTR_ADD $
ap,sp,$num
PTR_ADD $
tj,sp,$num # &tp[num]
move $tp,sp
move $ap,sp
bnez $hi1,.Lsub
li $hi0,0
sgeu AT,$lo1,$nj
beqz AT,.Lsub
nop
dsrl AT,$nj,62 # boundary condition...
beqz AT,.Lcopy # ... is met
li $hi0,0 # clear borrow bit
.align 4
.Lcopy: ld AT,($tp)
.Lsub: ld $lo0,($tp)
ld $lo1,($np)
PTR_ADD $tp,8
PTR_ADD $np,8
dsubu $lo1,$lo0,$lo1 # tp[i]-np[i]
sgtu AT,$lo1,$lo0
dsubu $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
sd $lo0,($rp)
or $hi0,AT
sltu AT,$tp,$tj
bnez AT,.Lsub
PTR_ADD $rp,8
dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit
move $tp,sp
PTR_SUB $rp,$num # restore rp
not $hi1,$hi0
and $ap,$hi0,sp
and $bp,$hi1,$rp
or $ap,$ap,$bp # ap=borrow?tp:rp
.align 4
.Lcopy: ld $aj,($ap)
PTR_ADD $ap,8
PTR_ADD $tp,8
sd AT,($rp)
sltu AT,$tp,$ap
sd zero,-8($tp)
sltu AT,$tp,$tj
sd $aj,($rp)
bnez AT,.Lcopy
PTR_ADD $rp,8
.Lexit:
ld s0,0($fp)
ld s1,8($fp)
ld s2,16($fp)
...
...
@@ -297,34 +320,6 @@ bn_mul_mont:
li v0,1
jr ra
PTR_ADD sp,$fp,64
.align 4
.Lsub: ld $lo0,($tp)
ld $lo1,($np)
dsubu $lo1,$lo0,$lo1
sgtu AT,$lo1,$lo0
dsubu $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
PTR_ADD $tp,8
or $hi0,AT
PTR_ADD $np,8
sd $lo0,($rp)
sltu AT,$tp,$ap
bnez AT,.Lsub
PTR_ADD $rp,8
dsubu $hi0,$hi1,$hi0
move $tp,sp
sgtu AT,$hi0,$hi1
bnez AT,.Lcopy
PTR_SUB $rp,$num
.align 4
.Lzap: sd zero,($tp)
sltu AT,$tp,$ap
bnez AT,.Lzap
PTR_ADD $tp,8
b .Lexit
nop
.set reorder
END(bn_mul_mont)
.rdata
...
...
This diff is collapsed.
Click to expand it.
crypto/bn/asm/ppc-mont.pl
浏览文件 @
7d9cf7c0
...
...
@@ -2,8 +2,9 @@
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# April 2006
...
...
@@ -42,6 +43,7 @@ if ($output =~ /32\-mont\.s/) {
$UMULL
=
"
mullw
";
# unsigned multiply low
$UMULH
=
"
mulhwu
";
# unsigned multiply high
$UCMP
=
"
cmplw
";
# unsigned compare
$SHRI
=
"
srwi
";
# unsigned shift right by immediate
$PUSH
=
$ST
;
$POP
=
$LD
;
}
elsif
(
$output
=~
/64\-mont\.s/
)
{
...
...
@@ -62,6 +64,7 @@ if ($output =~ /32\-mont\.s/) {
$UMULL
=
"
mulld
";
# unsigned multiply low
$UMULH
=
"
mulhdu
";
# unsigned multiply high
$UCMP
=
"
cmpld
";
# unsigned compare
$SHRI
=
"
srdi
";
# unsigned shift right by immediate
$PUSH
=
$ST
;
$POP
=
$LD
;
}
else
{
die
"
nonsense
$output
";
}
...
...
@@ -264,24 +267,37 @@ Linner:
addi $i,$i,$BNSZ
ble- Louter
$SHRI. $nj,$nj,$BITS-2 ; check boundary condition
addi $num,$num,2 ; restore $num
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
addi $tp,$sp,$FRAME
addi $ap,$sp,$FRAME
mtctr $num
beq Lcopy ; boundary condition is met
.align 4
Lsub: $LDX $tj,$tp,$j
$LDX $nj,$np,$j
subfe $aj,$nj,$tj ; tp[j]-np[j]
$STX $aj,$rp,$j
addi $j,$j,$BNSZ
bdnz- Lsub
li $j,0
mtctr $num
subfe $ovf,$j,$ovf ; handle upmost overflow bit
and $ap,$tp,$ovf
andc $np,$rp,$ovf
or $ap,$ap,$np ; ap=borrow?tp:rp
subfc. $ovf,$j,$ovf ; sets XER[CA]
bne Lsub
$UCMP $hi1,$nj
bge Lsub
.align 4
Lcopy:
$LDX $tj,$
t
p,$j
Lcopy:
; copy or in-place refresh
$LDX $tj,$
a
p,$j
$STX $tj,$rp,$j
$STX $j,$tp,$j ; zap at once
addi $j,$j,$BNSZ
bdnz- Lcopy
Lexit:
$POP r14,`4*$SIZE_T`($sp)
$POP r15,`5*$SIZE_T`($sp)
$POP r16,`6*$SIZE_T`($sp)
...
...
@@ -298,22 +314,7 @@ Lexit:
li r3,1
blr
.long 0
.align 4
Lsub: $LDX $tj,$tp,$j
$LDX $nj,$np,$j
subfe $tj,$nj,$tj ; tp[j]-np[j]
$STX $tj,$rp,$j
addi $j,$j,$BNSZ
bdnz- Lsub
li $j,0
subfe. $ovf,$j,$ovf
mtctr $num
bne Lcopy
.align 4
Lzap: $STX $j,$tp,$j
addi $j,$j,$BNSZ
bdnz- Lzap
b Lexit
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___
$code
=~
s/\`([^\`]*)\`/eval $1/g
em
;
...
...
This diff is collapsed.
Click to expand it.
crypto/bn/asm/s390x-mont.pl
浏览文件 @
7d9cf7c0
...
...
@@ -176,45 +176,45 @@ bn_mul_mont:
___
undef
$bi
;
$count
=
$
ap
;
undef
$a
p
;
$count
=
$
bp
;
undef
$b
p
;
$code
.=
<<___;
lg $rp,16+16($fp) # reincarnate rp
la $ap,8($fp)
lgr $j,$num
ltgr $AHI,$AHI
jnz .Lsub # upmost overflow bit is not zero
#slg $NHI,-8($np) # tp[num-1]-np[num-1]
lghi $count,-8 # buggy assembler
slg $NHI,0($count,$np) # buggy assembler
jnle .Lsub # branch if not borrow
.Lcopy: lg $alo,8($j,$fp)
stg $j,8($j,$fp)
stg $alo,0($j,$rp)
aghi $j,8
jnz .Lcopy
.Lexit:
lmg %r6,%r15,16+48($fp)
lghi %r2,1 # signal "processed"
br %r14
#lg $nhi,-8($np) # buggy assembler
lghi $count,-8 # buggy assembler
lg $nhi,0($count,$np) # buggy assembler
srag $nhi,$nhi,62 # boundary condition...
jz .Lcopy # ... is met
.Lsub:
lcgr $count,$num
lcgr $count,$num
sra $count,3 # incidentally clears "borrow"
.Lsubloop:
lg $alo,8($j,$fp)
.Lsub: lg $alo,0($j,$ap)
slbg $alo,0($j,$np)
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lsub
loop
brct $count,.Lsub
lghi $ahi,0
slbgr $AHI,$ahi
slbgr $AHI,$ahi # handle upmost carry
ngr $ap,$AHI
lghi $np,-1
xgr $np,$AHI
ngr $np,$rp
ogr $ap,$np # ap=borrow?tp:rp
lgr $j,$num
jle .Lcopy # branch if borrow
.Lzap: stg $j,8($j,$fp)
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
stg $j,8($j,$fp) # zap tp
stg $alo,0($j,$rp)
aghi $j,8
jnz .Lzap
j .Lexit
jnz .Lcopy
lmg %r6,%r15,16+48($fp)
lghi %r2,1 # signal "processed"
br %r14
.size bn_mul_mont,.-bn_mul_mont
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
...
...
This diff is collapsed.
Click to expand it.
crypto/bn/asm/sparcv9-mont.pl
浏览文件 @
7d9cf7c0
...
...
@@ -2,8 +2,9 @@
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# December 2005
...
...
@@ -254,44 +255,36 @@ $fname:
.Ltail:
add $np,$num,$np
add $rp,$num,$rp
cmp $car2,0 ! clears %icc.c
bne,pn %icc,.Lsub
mov $tp,$ap
sub %g0,$num,%o7 ! k=-num
cmp $car1,$npj ! compare top-most $tp and $np words
b
cs,pt %icc,.Lcopy ! %icc.c is clean if not taken
nop
srl $npj,30,%o0 ! boundary condition...
b
rz,pn %o0,.Lcopy ! ... is met
subcc %g0,%g0,%g0 ! clear %icc.c
.align 16,0x1000000
.Lsub:
ld [$tp+%o7],%o0
ld [$np+%o7],%o1
subccc %o0,%o1,%o1
subccc %o0,%o1,%o1
! tp[j]-np[j]
st %o1,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lsub
nop
subccc $car2,0,$car2
bcc %icc,.Lzap
subc $car2,0,$car2 ! handle upmost overflow bit
and $tp,$car2,$ap
andn $rp,$car2,$np
or $ap,$np,$ap
sub %g0,$num,%o7
.align 16,0x1000000
.Lcopy:
ld [$tp+%o7],%o0
ld [$ap+%o7],%o0 ! copy or in-place refresh
st %g0,[$tp+%o7] ! zap tp
st %o0,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lcopy
nop
ba .Lzap
sub %g0,$num,%o7
.align 32
.Lzap:
st %g0,[$tp+%o7]
add %o7,4,%o7
brnz %o7,.Lzap
nop
mov 1,%i0
ret
restore
...
...
@@ -609,6 +602,7 @@ $code.=<<___;
add $tp,8,$tp
.type $fname,#function
.size $fname,(.-$fname)
.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
___
$code
=~
s/\`([^\`]*)\`/eval($1)/g
em
;
print
$code
;
...
...
This diff is collapsed.
Click to expand it.
crypto/bn/asm/sparcv9a-mont.pl
浏览文件 @
7d9cf7c0
...
...
@@ -121,7 +121,6 @@ $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
$ASI_FL16_P
=
0xD2
;
# magic ASI value to engage 16-bit FP load
$code
=
<<___;
.ident "UltraSPARC Montgomery multiply by <appro\@fy.chalmers.se>"
.section ".text",#alloc,#execinstr
.global $fname
...
...
@@ -799,17 +798,14 @@ $fname:
bnz %icc,.Louter
nop
sub %g0,$num,%o7 ! n=-num
cmp $carry,0 ! clears %icc.c
bne,pn %icc,.Lsub
add $tp,8,$tp ! adjust tp to point at the end
ld [$tp-8],%o0
ld [$np-4],%o1
cmp %o0,%o1 ! compare topmost words
bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
nop
subcc %g0,%g0,%g0 ! clear %icc.c
add $tp,8,$tp ! adjust tp to point at the end
srl %o1,30,%o1 ! boundary condition...
orn %g0,%g0,%g4
brz,pn %o1,.Lcopy ! ... is met
sub %g0,$num,%o7 ! n=-num
.align 32,0x1000000
.Lsub:
ldx [$tp+%o7],%o0
...
...
@@ -824,24 +820,30 @@ $fname:
add %o7,8,%o7
brnz,pt %o7,.Lsub
st %o3,[%g1+4]
subccc $carry,0,$carry
bcc,pt %icc,.Lzap
subc $carry,0,%g4
sub %g0,$num,%o7 ! n=-num
.align
16
,0x1000000
.align
32
,0x1000000
.Lcopy:
ldx [$tp+%o7],%o0
srlx %o0,32,%o1
add $rp,%o7,%g1
ld [%g1+0],%o2
ld [%g1+4],%o3
stx %g0,[$tp+%o7]
and %o0,%g4,%o0
srlx %o0,32,%o1
andn %o2,%g4,%o2
andn %o3,%g4,%o3
or %o2,%o0,%o0
or %o3,%o1,%o1
st %o0,[%g1+0]
add %o7,8,%o7
brnz,pt %o7,.Lcopy
st %o1,[%g1+4]
sub %g0,$num,%o7 ! n=-num
.align 32
.align 32
,0x1000000
.Lzap:
stx %g0,[$tp+%o7]
stx %g0,[$ap_l+%o7]
stx %g0,[$ap_h+%o7]
stx %g0,[$np_l+%o7]
...
...
This diff is collapsed.
Click to expand it.
crypto/bn/asm/via-mont.pl
浏览文件 @
7d9cf7c0
...
...
@@ -77,7 +77,8 @@
# - in terms of absolute performance it delivers approximately as much
# as modern out-of-order 32-bit cores [again, for longer keys].
push
(
@INC
,"
.
","
../../perlasm
");
$
0
=~
m/(.*[\/\\])[^\/\\]+$/
;
$dir
=
$
1
;
push
(
@INC
,"
${dir}
","
${dir}
../../perlasm
");
require
"
x86asm.pl
";
&asm_init
(
$ARGV
[
0
],"
via-mont.pl
");
...
...
@@ -100,7 +101,7 @@ $sp=&DWP(28,"esp");
# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
# &DWP(64+(4*$num+$pad)*
2
,"esp") # padded copy of np[num]
# &DWP(64+(4*$num+$pad)*
3
,"esp") # padded copy of np[num]
# Note that SDK suggests to unconditionally allocate 2K per vector. This
# has quite an impact on performance. It naturally depends on key length,
# but to give an example 1024 bit private RSA key operations suffer >30%
...
...
@@ -115,7 +116,7 @@ $sp=&DWP(28,"esp");
&jnz
(
&label
("
leave
"));
# num % 4 != 0
&cmp
("
ecx
",
8
);
&jb
(
&label
("
leave
"));
# num < 8
&cmp
("
ecx
",
256
);
&cmp
("
ecx
",
1024
);
&ja
(
&label
("
leave
"));
# num > 1024
&pushf
();
...
...
@@ -148,74 +149,91 @@ $sp=&DWP(28,"esp");
&lea
("
ebp
",
&DWP
(
-
$pad
,"
ecx
"));
&shr
("
ebp
",
2
);
# restore original num value in ebp
&add
("
ecx
",
32
/4); # (4 vectors + 32 byte scratch)/
4
&xor
("
eax
","
eax
");
&mov
("
ecx
","
ebp
");
&lea
("
ecx
",
&DWP
((
32
+
$pad
)
/
4
,"
ecx
"));
# padded tp + scratch
&data_byte
(
0xf3
,
0xab
);
# rep stosl, bzero
&mov
("
ecx
","
ebp
");
&lea
("
edi
",
&DWP
(
64
+
$pad
,"
esp
","
ecx
",
4
));
# pointer to ap copy
&mov
(
$A
,"
edi
");
&data_byte
(
0xf3
,
0xa5
);
# rep movsl, memcpy
&mov
("
ecx
",
$pad
/
4
);
&data_byte
(
0xf3
,
0xab
);
# rep stosl, bzero pad
# edi points at the end of padded ap copy...
# edi points at the end of ap copy...
&mov
("
ecx
","
ebp
");
&add
("
edi
",
$pad
);
# skip padding to point at bp copy
&mov
("
esi
","
ebx
");
&mov
(
$B
,"
edi
");
&data_byte
(
0xf3
,
0xa5
);
# rep movsl, memcpy
&mov
("
ecx
",
$pad
/
4
);
&data_byte
(
0xf3
,
0xab
);
# rep stosl, bzero pad
# edi points at the end of padded bp copy...
# edi points at the end of bp copy...
&mov
("
ecx
","
ebp
");
&add
("
edi
",
$pad
);
# skip padding to point at np copy
&mov
("
esi
","
edx
");
&mov
(
$M
,"
edi
");
&data_byte
(
0xf3
,
0xa5
);
# rep movsl, memcpy
&mov
("
ecx
",
$pad
/
4
);
&data_byte
(
0xf3
,
0xab
);
# rep stosl, bzero pad
# edi points at the end of padded np copy...
# let magic happen...
&mov
("
ecx
","
ebp
");
&mov
("
esi
","
esp
");
&xor
("
eax
","
eax
");
&shl
("
ecx
",
5
);
# convert word counter to bit counter
&align
(
4
);
&data_byte
(
0xf3
,
0x0f
,
0xa6
,
0xc0
);
# rep montmul
&mov
("
ecx
","
ebp
");
&xor
("
edx
","
edx
");
# i=0
&lea
("
esi
",
&DWP
(
64
,"
esp
"));
# tp
# edi still points at the end of np copy...
&xor
("
edx
","
edx
");
# i=0
&lea
("
esi
",
&DWP
(
64
,"
esp
"));
# tp
# edi still points at the end of padded np copy...
&mov
("
eax
",
&DWP
(
-
4
-
$pad
,"
edi
"));
# np[num-1]
&neg
("
ebp
");
&lea
("
ebp
",
&DWP
(
0
,"
edi
","
ebp
",
4
));
# so just "rewind"
&mov
("
edi
",
$rp
);
# restore rp
&mov
("
ebx
",
&DWP
(
0
,"
esi
","
ecx
",
4
));
# upmost overflow bit
&cmp
("
ebx
",
0
);
# clears CF unconditionally
&jnz
(
&label
("
sub
"));
&mov
("
eax
",
&DWP
(
-
4
,"
esi
","
ecx
",
4
));
&cmp
("
eax
",
&DWP
(
-
4
,"
ebp
","
ecx
",
4
));
# tp[num-1]-np[num-1]?
&jae
(
&label
("
sub
"));
# if taken CF is cleared
&set_label
("
copy
",
4
);
&mov
("
ebx
","
ecx
");
&data_byte
(
0xf3
,
0xa5
);
# rep movsl
&mov
("
ecx
","
ebx
");
&jmp
(
&label
("
zap
"));
&set_label
("
sub
",
16
);
&lea
("
ebp
",
&DWP
(
-
$pad
,"
edi
","
ebp
",
4
));
# so just "rewind"
&mov
("
edi
",
$rp
);
# restore rp
&shr
("
eax
",
30
);
# boundary condition...
&jz
(
&label
("
copy
"));
# ... is met
&xor
("
edx
","
edx
");
# clear CF
&set_label
("
sub
",
8
);
&mov
("
eax
",
&DWP
(
0
,"
esi
","
edx
",
4
));
&sbb
("
eax
",
&DWP
(
0
,"
ebp
","
edx
",
4
));
&mov
(
&DWP
(
0
,"
edi
","
edx
",
4
),"
eax
");
# rp[i]=tp[i]-np[i]
&lea
("
edx
",
&DWP
(
1
,"
edx
"));
# i++
&dec
("
ecx
");
# doesn't affect CF!
&jg
(
&label
("
sub
"));
&sbb
("
ebx
",
0
);
# upmost overflow is still there
&mov
("
ecx
","
edx
");
&jc
(
&label
("
copy
"));
&loop
(
&label
("
sub
"));
# doesn't affect CF!
&mov
("
eax
",
&DWP
(
0
,"
esi
","
edx
",
4
));
# upmost overflow bit
&sbb
("
eax
",
0
);
&and
("
esi
","
eax
");
&not
("
eax
");
&mov
("
ebp
","
edi
");
&and
("
ebp
","
eax
");
&or
("
esi
","
ebp
");
# tp=carry?tp:rp
&mov
("
ecx
","
edx
");
# num
&xor
("
edx
","
edx
");
# i=0
&set_label
("
copy
",
8
);
&mov
("
eax
",
&DWP
(
0
,"
esi
","
edx
",
4
));
&mov
(
&DWP
(
64
,"
esp
","
edx
",
4
),"
ecx
");
# zap tp
&mov
(
&DWP
(
0
,"
edi
","
edx
",
4
),"
eax
");
&lea
("
edx
",
&DWP
(
1
,"
edx
"));
# i++
&loop
(
&label
("
copy
"));
&set_label
("
zap
",
4
);
&mov
("
ebp
",
$sp
);
&xor
("
eax
","
eax
");
&lea
("
ecx
",
&DWP
(
64
/
4
+
$pad
,"","
ecx
",
4
));
# size of frame divided by 4
&mov
("
edi
","
esp
");
&mov
("
ecx
",
64
/
4
);
&mov
("
edi
","
esp
");
# zap frame including scratch area
&data_byte
(
0xf3
,
0xab
);
# rep stosl, bzero
# zap copies of ap, bp and np
&lea
("
edi
",
&DWP
(
64
+
$pad
,"
esp
","
edx
",
4
));
# pointer to ap
&lea
("
ecx
",
&DWP
(
3
*$pad
/
4
,"
edx
","
edx
",
2
));
&data_byte
(
0xf3
,
0xab
);
# rep stosl, bzero
&mov
("
esp
","
ebp
");
...
...
@@ -224,4 +242,6 @@ $sp=&DWP(28,"esp");
&set_label
("
leave
");
&function_end
(
$func
);
&asciz
("
Padlock Montgomery Multiplication, CRYPTOGAMS by <appro
\@
openssl.org>
");
&asm_finish
();
This diff is collapsed.
Click to expand it.
crypto/bn/asm/x86-mont.pl
浏览文件 @
7d9cf7c0
...
...
@@ -41,7 +41,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
$i
=
"
edx
";
$j
=
"
ecx
";
$ap
=
"
esi
";
$ap
=
"
esi
";
$tp
=
"
esi
";
# overlapping variables!!!
$rp
=
"
edi
";
$bp
=
"
edi
";
# overlapping variables!!!
$np
=
"
ebp
";
$num
=
"
ebx
";
...
...
@@ -551,41 +551,39 @@ $sbit=$num;
}
&set_label
("
common_tail
",
16
);
&mov
(
$np
,
$_np
);
&mov
("
esi
",
&DWP
(
$frame
+
4
,"
esp
",
$num
,
4
));
# load upmost overflow bit
&mov
(
$np
,
$_np
);
# load modulus pointer
&mov
(
$rp
,
$_rp
);
# load result pointer
# [$ap and $bp are zapped]
&xor
(
$i
,
$i
);
# i=0
&lea
(
$tp
,
&DWP
(
$frame
,"
esp
"));
# [$ap and $bp are zapped]
&mov
("
eax
",
&DWP
(
0
,
$np
,
$num
,
4
));
# np[num-1]
&shr
("
eax
",
30
);
# check for boundary condition
&jz
(
&label
("
copy
"));
&mov
("
eax
",
&DWP
(
0
,
$tp
));
# tp[0]
&mov
(
$j
,
$num
);
# j=num-1
&cmp
("
esi
",
0
);
# clears CF unconditionally
&jnz
(
&label
("
sub
"));
&mov
("
eax
",
&DWP
(
$frame
,"
esp
",
$j
,
4
));
&cmp
("
eax
",
&DWP
(
0
,
$np
,
$j
,
4
));
# tp[num-1]-np[num-1]?
&jae
(
&label
("
sub
"));
# if taken CF is cleared
&set_label
("
copy
",
16
);
&mov
("
eax
",
&DWP
(
$frame
,"
esp
",
$j
,
4
));
&mov
(
&DWP
(
0
,
$rp
,
$j
,
4
),"
eax
");
# rp[i]=tp[i]
&mov
(
&DWP
(
$frame
,"
esp
",
$j
,
4
),
$j
);
# zap temporary vector
&dec
(
$j
);
&jge
(
&label
("
copy
"));
&jmp
(
&label
("
exit
"));
&xor
(
$i
,
$i
);
# i=0 and clear CF!
&set_label
("
sub
",
16
);
&mov
("
eax
",
&DWP
(
$frame
,"
esp
",
$i
,
4
));
&sbb
("
eax
",
&DWP
(
0
,
$np
,
$i
,
4
));
&mov
(
&DWP
(
0
,
$rp
,
$i
,
4
),"
eax
");
# rp[i]=tp[i]-np[i]
&lea
(
$i
,
&DWP
(
1
,
$i
));
# i++
&dec
(
$j
);
# doesn't affect CF!
&mov
("
eax
",
&DWP
(
4
,
$tp
,
$i
,
4
));
# tp[i+1]
&lea
(
$i
,
&DWP
(
1
,
$i
));
# i++
&jge
(
&label
("
sub
"));
&mov
(
$j
,
$num
);
# j=num-1
&sbb
("
esi
",
0
);
# esi holds upmost overflow bit
&jc
(
&label
("
copy
"));
&set_label
("
zap
",
8
);
&mov
(
&DWP
(
$frame
,"
esp
",
$j
,
4
),
$i
);
# zap temporary vector
&dec
(
$j
);
&jge
(
&label
("
zap
"));
&set_label
("
exit
",
8
);
&sbb
("
eax
",
0
);
# handle upmost overflow bit
&and
(
$tp
,"
eax
");
&not
("
eax
");
&mov
(
$np
,
$rp
);
&and
(
$np
,"
eax
");
&or
(
$tp
,
$np
);
# tp=carry?tp:rp
&set_label
("
copy
",
16
);
# copy or in-place refresh
&mov
("
eax
",
&DWP
(
0
,
$tp
,
$num
,
4
));
&mov
(
&DWP
(
0
,
$rp
,
$num
,
4
),"
eax
");
# rp[i]=tp[i]
&mov
(
&DWP
(
$frame
,"
esp
",
$num
,
4
),
$j
);
# zap temporary vector
&dec
(
$num
);
&jge
(
&label
("
copy
"));
&mov
("
esp
",
$_sp
);
# pull saved stack pointer
&mov
("
eax
",
1
);
&set_label
("
just_leave
");
...
...
This diff is collapsed.
Click to expand it.
crypto/bn/asm/x86_64-mont.pl
浏览文件 @
7d9cf7c0
...
...
@@ -59,6 +59,7 @@ bn_mul_mont:
neg %rax
lea (%rsp,%rax,8),%rsp # tp=alloca(8*(num+2))
and \$-1024,%rsp # minimize TLB usage
mov %rbp,8(%rsp,$num,8) # tp[num+1]=%rsp
mov %rdx,$bp # $bp reassigned, remember?
...
...
@@ -166,22 +167,38 @@ bn_mul_mont:
cmp $num,$i
jl .Louter
xor $i,$i # i=0
mov -8($np,$num,8),%rax # np[num-1]
lea (%rsp),$ap # borrow ap for tp
shr \$62,%rax # check for boundary condition
jz .Lcopy
mov ($ap),%rax # tp[0]
lea -1($num),$j # j=num-1
cmp \$0,%rdx # %rdx still holds upmost overflow bit
jnz .Lsub # CF is cleared by compare with 0
mov (%rsp,$j,8),%rax
cmp ($np,$j,8),%rax # tp[num-1]-np[num-1]
jae .Lsub # if taken CF was cleared by above cmp
.align 4
.Lcopy:
mov (%rsp,$j,8),%rax
xor $i,$i # i=0 and clear CF!
jmp .Lsub
.align 16
.Lsub: sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
dec $j # doesn't affect CF!
mov 8($ap,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
jge .Lsub
sbb \$0,%rax # handle upmost overflow bit
and %rax,$ap
not %rax
mov $rp,$np
and %rax,$np
lea -1($num),$j
or $np,$ap # ap=borrow?tp:rp
.align 16
.Lcopy: # copy or in-place refresh
mov ($ap,$j,8),%rax
mov %rax,($rp,$j,8) # rp[i]=tp[i]
mov $i,(%rsp,$j,8) # zap temporary vector
dec $j
jge .Lcopy
.align 4
.Lexit:
mov 8(%rsp,$num,8),%rsp # restore %rsp
mov \$1,%rax
pop %r15
...
...
@@ -191,22 +208,6 @@ bn_mul_mont:
pop %rbp
pop %rbx
ret
.align 16
.Lsub: mov (%rsp,$i,8),%rax
sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[j]
lea 1($i),$i # i++
dec $j # doesn't affect CF!
jge .Lsub
lea -1($num),$j # j=num-1
sbb \$0,%rdx
jc .Lcopy # tp was less than np
.align 4
.Lzap: mov $i,(%rsp,$j,8) # zap temporary vector
dec $j
jge .Lzap
jmp .Lexit
.size bn_mul_mont,.-bn_mul_mont
.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
...
...
This diff is collapsed.
Click to expand it.
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录
新手
引导
客服
返回
顶部