提交 3f66f204 作者: Andy Polyakov

x86_64-mont.pl: minor optimization.

上级 25348918
......@@ -817,15 +817,14 @@ bn_sqr4x_mont:
xor $A0[1],$A0[1]
add $A1[0],$A0[0]
lea 16($j),$j
adc \$0,$A0[1]
mul $a0 # a[5]*a[2]
add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
mov $ai,%rax
adc %rdx,$A0[1]
mov $A0[0],-8($tptr,$j) # t[5]
mov $A0[0],8($tptr,$j) # t[5]
mov ($aptr,$j),$ai # a[6]
mov 16($aptr,$j),$ai # a[6]
xor $A1[0],$A1[0]
mul $a1 # a[5]*a[3]
add %rax,$A1[1] # a[5]*a[3]+t[6]
......@@ -839,10 +838,10 @@ bn_sqr4x_mont:
add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6]
mov $ai,%rax # a[3]
adc %rdx,$A0[0]
mov $A0[1],($tptr,$j) # t[6]
mov $A0[1],16($tptr,$j) # t[6]
mov 8($aptr,$j),$ai # a[7]
mov 24($aptr,$j),$ai # a[7]
xor $A1[1],$A1[1]
mul $a1 # a[6]*a[5]
add %rax,$A1[0] # a[6]*a[5]+t[7]
......@@ -851,7 +850,7 @@ bn_sqr4x_mont:
xor $A0[1],$A0[1]
add $A1[0],$A0[0]
lea 16($j),$j
lea 32($j),$j
adc \$0,$A0[1]
mul $a0 # a[7]*a[4]
add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6]
......@@ -962,7 +961,7 @@ bn_sqr4x_mont:
add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
mov $ai,%rax
adc %rdx,$A0[1]
mov $A0[0],-8($tptr,$j) # t[5]
mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
cmp \$0,$j
jne .Lsqr4x_inner
......@@ -974,8 +973,8 @@ bn_sqr4x_mont:
add %rax,$A1[1]
adc %rdx,$A1[0]
mov $A1[1],($tptr) # t[6]
mov $A1[0],8($tptr) # t[7]
mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below
add \$16,$i
jnz .Lsqr4x_outer
......@@ -988,16 +987,15 @@ bn_sqr4x_mont:
mov -16($aptr),$ai # a[2]
mov %rax,$a1
mov -24($tptr),$A0[0] # t[1]
xor $A0[1],$A0[1]
mul $a0 # a[1]*a[0]
add %rax,$A0[0] # a[1]*a[0]+t[1]
add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
mov $ai,%rax # a[2]
adc %rdx,$A0[1]
mov $A0[0],-24($tptr) # t[1]
xor $A0[0],$A0[0]
add -16($tptr),$A0[1] # a[2]*a[0]+t[2]
add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
adc \$0,$A0[0]
mul $a0 # a[2]*a[0]
add %rax,$A0[1]
......@@ -1005,18 +1003,15 @@ bn_sqr4x_mont:
adc %rdx,$A0[0]
mov $A0[1],-16($tptr) # t[2]
xor $A1[0],$A1[0]
mov -8($aptr),$ai # a[3]
xor $A1[1],$A1[1]
add -8($tptr),$A1[0]
adc \$0,$A1[1]
mul $a1 # a[2]*a[1]
add %rax,$A1[0] # a[2]*a[1]+t[3]
add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
mov $ai,%rax
adc %rdx,$A1[1]
adc \$0,%rdx
xor $A0[1],$A0[1]
add $A1[0],$A0[0]
mov %rdx,$A1[1]
adc \$0,$A0[1]
mul $a0 # a[3]*a[0]
add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册