提交 9e557ab2 编写于 作者: Andy Polyakov

ecp_nistz256-x86_64.pl: fix occasional failures.

RT: 3607
Reviewed-by: Adam Langley <agl@google.com>
Reviewed-by: Emilia Kasper <emilia@openssl.org>
上级 2c60925d
...@@ -31,15 +31,16 @@ ...@@ -31,15 +31,16 @@
# Further optimization by <appro@openssl.org>: # Further optimization by <appro@openssl.org>:
# #
# this/original # this/original
# Opteron +8-33% # Opteron +12-49%
# Bulldozer +10-30% # Bulldozer +14-45%
# P4 +14-38% # P4 +18-46%
# Westmere +8-23% # Westmere +12-34%
# Sandy Bridge +8-24% # Sandy Bridge +9-35%
# Ivy Bridge +7-25% # Ivy Bridge +9-35%
# Haswell +5-25% # Haswell +8-37%
# Atom +10-32% # Broadwell +18-58%
# VIA Nano +37-130% # Atom +15-50%
# VIA Nano +43-160%
# #
# Ranges denote minimum and maximum improvement coefficients depending # Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, relatively # on benchmark. Lower coefficients are for ECDSA sign, relatively
...@@ -550,28 +551,20 @@ __ecp_nistz256_mul_montq: ...@@ -550,28 +551,20 @@ __ecp_nistz256_mul_montq:
# and add the result to the acc. # and add the result to the acc.
# Due to the special form of p256 we do some optimizations # Due to the special form of p256 we do some optimizations
# #
# acc[0] x p256[0] = acc[0] x 2^64 - acc[0] # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
# then we add acc[0] and get acc[0] x 2^64 # then we add acc[0] and get acc[0] x 2^96
mulq $poly1
xor $t0, $t0
add $acc0, $acc1 # +=acc[0]*2^64
adc \$0, %rdx
add %rax, $acc1
mov $acc0, %rax
# acc[0] x p256[2] = 0
adc %rdx, $acc2
adc \$0, $t0
mov $acc0, $t1
shl \$32, $acc0
mulq $poly3 mulq $poly3
xor $acc0, $acc0 shr \$32, $t1
add $t0, $acc3 add $acc0, $acc1 # +=acc[0]<<96
adc \$0, %rdx adc $t1, $acc2
add %rax, $acc3 adc %rax, $acc3
mov 8*1($b_ptr), %rax mov 8*1($b_ptr), %rax
adc %rdx, $acc4 adc %rdx, $acc4
adc \$0, $acc5 adc \$0, $acc5
xor $acc0, $acc0
######################################################################## ########################################################################
# Multiply by b[1] # Multiply by b[1]
...@@ -608,23 +601,17 @@ __ecp_nistz256_mul_montq: ...@@ -608,23 +601,17 @@ __ecp_nistz256_mul_montq:
######################################################################## ########################################################################
# Second reduction step # Second reduction step
mulq $poly1 mov $acc1, $t1
xor $t0, $t0 shl \$32, $acc1
add $acc1, $acc2
adc \$0, %rdx
add %rax, $acc2
mov $acc1, %rax
adc %rdx, $acc3
adc \$0, $t0
mulq $poly3 mulq $poly3
xor $acc1, $acc1 shr \$32, $t1
add $t0, $acc4 add $acc1, $acc2
adc \$0, %rdx adc $t1, $acc3
add %rax, $acc4 adc %rax, $acc4
mov 8*2($b_ptr), %rax mov 8*2($b_ptr), %rax
adc %rdx, $acc5 adc %rdx, $acc5
adc \$0, $acc0 adc \$0, $acc0
xor $acc1, $acc1
######################################################################## ########################################################################
# Multiply by b[2] # Multiply by b[2]
...@@ -661,23 +648,17 @@ __ecp_nistz256_mul_montq: ...@@ -661,23 +648,17 @@ __ecp_nistz256_mul_montq:
######################################################################## ########################################################################
# Third reduction step # Third reduction step
mulq $poly1 mov $acc2, $t1
xor $t0, $t0 shl \$32, $acc2
add $acc2, $acc3
adc \$0, %rdx
add %rax, $acc3
mov $acc2, %rax
adc %rdx, $acc4
adc \$0, $t0
mulq $poly3 mulq $poly3
xor $acc2, $acc2 shr \$32, $t1
add $t0, $acc5 add $acc2, $acc3
adc \$0, %rdx adc $t1, $acc4
add %rax, $acc5 adc %rax, $acc5
mov 8*3($b_ptr), %rax mov 8*3($b_ptr), %rax
adc %rdx, $acc0 adc %rdx, $acc0
adc \$0, $acc1 adc \$0, $acc1
xor $acc2, $acc2
######################################################################## ########################################################################
# Multiply by b[3] # Multiply by b[3]
...@@ -714,20 +695,14 @@ __ecp_nistz256_mul_montq: ...@@ -714,20 +695,14 @@ __ecp_nistz256_mul_montq:
######################################################################## ########################################################################
# Final reduction step # Final reduction step
mulq $poly1 mov $acc3, $t1
#xor $t0, $t0 shl \$32, $acc3
add $acc3, $acc4
adc \$0, %rdx
add %rax, $acc4
mov $acc3, %rax
adc %rdx, $acc5
#adc \$0, $t0 # doesn't overflow
mulq $poly3 mulq $poly3
#add $t0, $acc0 shr \$32, $t1
#adc \$0, %rdx add $acc3, $acc4
adc $t1, $acc5
mov $acc4, $t0 mov $acc4, $t0
add %rax, $acc0 adc %rax, $acc0
adc %rdx, $acc1 adc %rdx, $acc1
mov $acc5, $t1 mov $acc5, $t1
adc \$0, $acc2 adc \$0, $acc2
...@@ -740,14 +715,14 @@ __ecp_nistz256_mul_montq: ...@@ -740,14 +715,14 @@ __ecp_nistz256_mul_montq:
sbb \$0, $acc0 # .Lpoly[2] sbb \$0, $acc0 # .Lpoly[2]
mov $acc1, $t3 mov $acc1, $t3
sbb $poly3, $acc1 # .Lpoly[3] sbb $poly3, $acc1 # .Lpoly[3]
neg $acc2 sbb \$0, $acc2
cmovnc $t0, $acc4 cmovc $t0, $acc4
cmovnc $t1, $acc5 cmovc $t1, $acc5
mov $acc4, 8*0($r_ptr) mov $acc4, 8*0($r_ptr)
cmovnc $t2, $acc0 cmovc $t2, $acc0
mov $acc5, 8*1($r_ptr) mov $acc5, 8*1($r_ptr)
cmovnc $t3, $acc1 cmovc $t3, $acc1
mov $acc0, 8*2($r_ptr) mov $acc0, 8*2($r_ptr)
mov $acc1, 8*3($r_ptr) mov $acc1, 8*3($r_ptr)
...@@ -897,89 +872,62 @@ __ecp_nistz256_sqr_montq: ...@@ -897,89 +872,62 @@ __ecp_nistz256_sqr_montq:
########################################## ##########################################
# Now the reduction # Now the reduction
# First iteration # First iteration
mulq $a_ptr mov $acc0, $t0
#xor $t0, $t0 shl \$32, $acc0
add $acc0, $acc1
adc \$0, %rdx
add %rax, $acc1
mov $acc0, %rax
adc %rdx, $acc2 # doesn't overflow
#adc \$0, $t0
mulq $t1 mulq $t1
xor $acc0, $acc0 shr \$32, $t0
#add $t0, $acc3 add $acc0, $acc1 # +=acc[0]<<96
#adc \$0, %rdx adc $t0, $acc2
add %rax, $acc3 adc %rax, $acc3
mov $acc1, %rax mov $acc1, %rax
adc %rdx, $acc4 adc \$0, %rdx
adc \$0, $acc0
########################################## ##########################################
# Second iteration # Second iteration
mulq $a_ptr mov $acc1, $t0
#xor $t0, $t0 shl \$32, $acc1
add $acc1, $acc2 mov %rdx, $acc0
adc \$0, %rdx
add %rax, $acc2
mov $acc1, %rax
adc %rdx, $acc3 # doesn't overflow
#adc \$0, $t0
mulq $t1 mulq $t1
xor $acc1, $acc1 shr \$32, $t0
#add $t0, $acc4 add $acc1, $acc2
#adc \$0, %rdx adc $t0, $acc3
add %rax, $acc4 adc %rax, $acc0
mov $acc2, %rax mov $acc2, %rax
adc %rdx, $acc0 adc \$0, %rdx
adc \$0, $acc1
########################################## ##########################################
# Third iteration # Third iteration
mulq $a_ptr mov $acc2, $t0
#xor $t0, $t0 shl \$32, $acc2
add $acc2, $acc3 mov %rdx, $acc1
adc \$0, %rdx
add %rax, $acc3
mov $acc2, %rax
adc %rdx, $acc4 # doesn't overflow
#adc \$0, $t0
mulq $t1 mulq $t1
xor $acc2, $acc2 shr \$32, $t0
#add $t0, $acc0 add $acc2, $acc3
#adc \$0, %rdx adc $t0, $acc0
add %rax, $acc0 adc %rax, $acc1
mov $acc3, %rax mov $acc3, %rax
adc %rdx, $acc1 adc \$0, %rdx
adc \$0, $acc2
########################################### ###########################################
# Last iteration # Last iteration
mulq $a_ptr mov $acc3, $t0
#xor $t0, $t0 shl \$32, $acc3
add $acc3, $acc4 mov %rdx, $acc2
adc \$0, %rdx
add %rax, $acc4
mov $acc3, %rax
adc %rdx, $acc0 # doesn't overflow
#adc \$0, $t0
mulq $t1 mulq $t1
shr \$32, $t0
add $acc3, $acc0
adc $t0, $acc1
adc %rax, $acc2
adc \$0, %rdx
xor $acc3, $acc3 xor $acc3, $acc3
#add $t0, $acc1
#adc \$0, %rdx
add %rax, $acc1
adc %rdx, $acc2
adc \$0, $acc3
############################################ ############################################
# Add the rest of the acc # Add the rest of the acc
add $acc0, $acc5 add $acc0, $acc4
adc $acc1, $acc5
mov $acc4, $acc0 mov $acc4, $acc0
adc $acc1, $acc6 adc $acc2, $acc6
adc $acc2, $acc7 adc %rdx, $acc7
mov $acc5, $acc1 mov $acc5, $acc1
adc \$0, $acc3 adc \$0, $acc3
...@@ -989,14 +937,14 @@ __ecp_nistz256_sqr_montq: ...@@ -989,14 +937,14 @@ __ecp_nistz256_sqr_montq:
sbb \$0, $acc6 # .Lpoly[2] sbb \$0, $acc6 # .Lpoly[2]
mov $acc7, $t0 mov $acc7, $t0
sbb $t1, $acc7 # .Lpoly[3] sbb $t1, $acc7 # .Lpoly[3]
neg $acc3 sbb \$0, $acc3
cmovnc $acc0, $acc4 cmovc $acc0, $acc4
cmovnc $acc1, $acc5 cmovc $acc1, $acc5
mov $acc4, 8*0($r_ptr) mov $acc4, 8*0($r_ptr)
cmovnc $acc2, $acc6 cmovc $acc2, $acc6
mov $acc5, 8*1($r_ptr) mov $acc5, 8*1($r_ptr)
cmovnc $t0, $acc7 cmovc $t0, $acc7
mov $acc6, 8*2($r_ptr) mov $acc6, 8*2($r_ptr)
mov $acc7, 8*3($r_ptr) mov $acc7, 8*3($r_ptr)
...@@ -1028,18 +976,15 @@ __ecp_nistz256_mul_montx: ...@@ -1028,18 +976,15 @@ __ecp_nistz256_mul_montx:
######################################################################## ########################################################################
# First reduction step # First reduction step
xor $acc0, $acc0 # $acc0=0,cf=0,of=0 add $t1, $acc1
adox $t1, $acc1 adc $t0, $acc2
adox $t0, $acc2
mulx $poly3, $t0, $t1 mulx $poly3, $t0, $t1
mov 8*1($b_ptr), %rdx mov 8*1($b_ptr), %rdx
adox $t0, $acc3 adc $t0, $acc3
adcx $t1, $acc4 adc $t1, $acc4
adc \$0, $acc5
adox $acc0, $acc4 xor $acc0, $acc0 # $acc0=0,cf=0,of=0
adcx $acc0, $acc5 # cf=0
adox $acc0, $acc5 # of=0
######################################################################## ########################################################################
# Multiply by b[1] # Multiply by b[1]
...@@ -1068,18 +1013,15 @@ __ecp_nistz256_mul_montx: ...@@ -1068,18 +1013,15 @@ __ecp_nistz256_mul_montx:
######################################################################## ########################################################################
# Second reduction step # Second reduction step
xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 add $t0, $acc2
adox $t0, $acc2 adc $t1, $acc3
adox $t1, $acc3
mulx $poly3, $t0, $t1 mulx $poly3, $t0, $t1
mov 8*2($b_ptr), %rdx mov 8*2($b_ptr), %rdx
adox $t0, $acc4 adc $t0, $acc4
adcx $t1, $acc5 adc $t1, $acc5
adc \$0, $acc0
adox $acc1, $acc5 xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
adcx $acc1, $acc0 # cf=0
adox $acc1, $acc0 # of=0
######################################################################## ########################################################################
# Multiply by b[2] # Multiply by b[2]
...@@ -1108,18 +1050,15 @@ __ecp_nistz256_mul_montx: ...@@ -1108,18 +1050,15 @@ __ecp_nistz256_mul_montx:
######################################################################## ########################################################################
# Third reduction step # Third reduction step
xor $acc2, $acc2 # $acc2=0,cf=0,of=0 add $t0, $acc3
adox $t0, $acc3 adc $t1, $acc4
adox $t1, $acc4
mulx $poly3, $t0, $t1 mulx $poly3, $t0, $t1
mov 8*3($b_ptr), %rdx mov 8*3($b_ptr), %rdx
adox $t0, $acc5 adc $t0, $acc5
adcx $t1, $acc0 adc $t1, $acc0
adc \$0, $acc1
adox $acc2, $acc0 xor $acc2, $acc2 # $acc2=0,cf=0,of=0
adcx $acc2, $acc1 # cf=0
adox $acc2, $acc1 # of=0
######################################################################## ########################################################################
# Multiply by b[3] # Multiply by b[3]
...@@ -1148,38 +1087,34 @@ __ecp_nistz256_mul_montx: ...@@ -1148,38 +1087,34 @@ __ecp_nistz256_mul_montx:
######################################################################## ########################################################################
# Fourth reduction step # Fourth reduction step
xor $acc3, $acc3 # $acc3=0,cf=0,of=0 add $t0, $acc4
adox $t0, $acc4 adc $t1, $acc5
adox $t1, $acc5
mulx $poly3, $t0, $t1 mulx $poly3, $t0, $t1
mov $acc4, $t2 mov $acc4, $t2
mov .Lpoly+8*1(%rip), $poly1 mov .Lpoly+8*1(%rip), $poly1
adcx $t0, $acc0 adc $t0, $acc0
adox $t1, $acc1
mov $acc5, $t3 mov $acc5, $t3
adc $t1, $acc1
adcx $acc3, $acc1
adox $acc3, $acc2
adc \$0, $acc2 adc \$0, $acc2
mov $acc0, $t0
######################################################################## ########################################################################
# Branch-less conditional subtraction of P # Branch-less conditional subtraction of P
xor %eax, %eax xor %eax, %eax
mov $acc0, $t0
sbb \$-1, $acc4 # .Lpoly[0] sbb \$-1, $acc4 # .Lpoly[0]
sbb $poly1, $acc5 # .Lpoly[1] sbb $poly1, $acc5 # .Lpoly[1]
sbb \$0, $acc0 # .Lpoly[2] sbb \$0, $acc0 # .Lpoly[2]
mov $acc1, $t1 mov $acc1, $t1
sbb $poly3, $acc1 # .Lpoly[3] sbb $poly3, $acc1 # .Lpoly[3]
sbb \$0, $acc2
bt \$0,$acc2 cmovc $t2, $acc4
cmovnc $t2, $acc4 cmovc $t3, $acc5
cmovnc $t3, $acc5
mov $acc4, 8*0($r_ptr) mov $acc4, 8*0($r_ptr)
cmovnc $t0, $acc0 cmovc $t0, $acc0
mov $acc5, 8*1($r_ptr) mov $acc5, 8*1($r_ptr)
cmovnc $t1, $acc1 cmovc $t1, $acc1
mov $acc0, 8*2($r_ptr) mov $acc0, 8*2($r_ptr)
mov $acc1, 8*3($r_ptr) mov $acc1, 8*3($r_ptr)
...@@ -1247,52 +1182,44 @@ __ecp_nistz256_sqr_montx: ...@@ -1247,52 +1182,44 @@ __ecp_nistz256_sqr_montx:
mov .Lpoly+8*3(%rip), $t1 mov .Lpoly+8*3(%rip), $t1
# reduction step 1 # reduction step 1
xor $acc0, $acc0 add $t0, $acc1
adcx $t0, $acc1 adc $t4, $acc2
adcx $t4, $acc2
mulx $t1, $t0, $t4 mulx $t1, $t0, $acc0
mov $acc1, %rdx mov $acc1, %rdx
adcx $t0, $acc3 adc $t0, $acc3
shlx $a_ptr, $acc1, $t0 shlx $a_ptr, $acc1, $t0
adox $t4, $acc0
shrx $a_ptr, $acc1, $t4
adc \$0, $acc0 adc \$0, $acc0
shrx $a_ptr, $acc1, $t4
# reduction step 2 # reduction step 2
xor $acc1, $acc1 add $t0, $acc2
adcx $t0, $acc2 adc $t4, $acc3
adcx $t4, $acc3
mulx $t1, $t0, $t4 mulx $t1, $t0, $acc1
mov $acc2, %rdx mov $acc2, %rdx
adcx $t0, $acc0 adc $t0, $acc0
shlx $a_ptr, $acc2, $t0 shlx $a_ptr, $acc2, $t0
adox $t4, $acc1
shrx $a_ptr, $acc2, $t4
adc \$0, $acc1 adc \$0, $acc1
shrx $a_ptr, $acc2, $t4
# reduction step 3 # reduction step 3
xor $acc2, $acc2 add $t0, $acc3
adcx $t0, $acc3 adc $t4, $acc0
adcx $t4, $acc0
mulx $t1, $t0, $t4 mulx $t1, $t0, $acc2
mov $acc3, %rdx mov $acc3, %rdx
adcx $t0, $acc1 adc $t0, $acc1
shlx $a_ptr, $acc3, $t0 shlx $a_ptr, $acc3, $t0
adox $t4, $acc2
shrx $a_ptr, $acc3, $t4
adc \$0, $acc2 adc \$0, $acc2
shrx $a_ptr, $acc3, $t4
# reduction step 4 # reduction step 4
xor $acc3, $acc3 add $t0, $acc0
adcx $t0, $acc0 adc $t4, $acc1
adcx $t4, $acc1
mulx $t1, $t0, $t4 mulx $t1, $t0, $acc3
adcx $t0, $acc2 adc $t0, $acc2
adox $t4, $acc3
adc \$0, $acc3 adc \$0, $acc3
xor $t3, $t3 # cf=0 xor $t3, $t3 # cf=0
...@@ -1312,14 +1239,14 @@ __ecp_nistz256_sqr_montx: ...@@ -1312,14 +1239,14 @@ __ecp_nistz256_sqr_montx:
sbb \$0, $acc6 # .Lpoly[2] sbb \$0, $acc6 # .Lpoly[2]
mov $acc7, $acc3 mov $acc7, $acc3
sbb $t1, $acc7 # .Lpoly[3] sbb $t1, $acc7 # .Lpoly[3]
sbb \$0, $t3
bt \$0,$t3 cmovc $acc0, $acc4
cmovnc $acc0, $acc4 cmovc $acc1, $acc5
cmovnc $acc1, $acc5
mov $acc4, 8*0($r_ptr) mov $acc4, 8*0($r_ptr)
cmovnc $acc2, $acc6 cmovc $acc2, $acc6
mov $acc5, 8*1($r_ptr) mov $acc5, 8*1($r_ptr)
cmovnc $acc3, $acc7 cmovc $acc3, $acc7
mov $acc6, 8*2($r_ptr) mov $acc6, 8*2($r_ptr)
mov $acc7, 8*3($r_ptr) mov $acc7, 8*3($r_ptr)
...@@ -1330,8 +1257,8 @@ ___ ...@@ -1330,8 +1257,8 @@ ___
} }
{ {
my ($r_ptr,$in_ptr)=("%rdi","%rsi"); my ($r_ptr,$in_ptr)=("%rdi","%rsi");
my ($acc0,$acc1,$acc2,$acc3,$acc4)=map("%r$_",(8..12)); my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
my ($t0,$t1)=("%rcx","%rsi"); my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
$code.=<<___; $code.=<<___;
################################################################################ ################################################################################
...@@ -1348,109 +1275,83 @@ ecp_nistz256_from_mont: ...@@ -1348,109 +1275,83 @@ ecp_nistz256_from_mont:
push %r13 push %r13
mov 8*0($in_ptr), %rax mov 8*0($in_ptr), %rax
mov .Lpoly+8*3(%rip), $t2
mov 8*1($in_ptr), $acc1 mov 8*1($in_ptr), $acc1
mov 8*2($in_ptr), $acc2 mov 8*2($in_ptr), $acc2
mov 8*3($in_ptr), $acc3 mov 8*3($in_ptr), $acc3
lea .Lpoly(%rip), $in_ptr
xor $acc4, $acc4
mov %rax, $acc0 mov %rax, $acc0
mov .Lpoly+8*1(%rip), $t1
######################################### #########################################
# First iteration # First iteration
mulq 1*8($in_ptr) mov %rax, $t0
xor $t0, $t0 shl \$32, $acc0
mulq $t2
shr \$32, $t0
add $acc0, $acc1 add $acc0, $acc1
adc \$0, %rdx adc $t0, $acc2
add %rax, $acc1 adc %rax, $acc3
mov $acc0, %rax
adc %rdx, $acc2
adc \$0, $t0
mulq 3*8($in_ptr)
xor $acc0, $acc0
add $t0, $acc3
adc \$0, %rdx
add %rax, $acc3
mov $acc1, %rax mov $acc1, %rax
adc %rdx, $acc4 adc \$0, %rdx
adc \$0, $acc0
######################################### #########################################
# Second iteration # Second iteration
mulq 1*8($in_ptr) mov $acc1, $t0
xor $t0, $t0 shl \$32, $acc1
mov %rdx, $acc0
mulq $t2
shr \$32, $t0
add $acc1, $acc2 add $acc1, $acc2
adc \$0, %rdx adc $t0, $acc3
add %rax, $acc2 adc %rax, $acc0
mov $acc1, %rax
adc %rdx, $acc3
adc \$0, $t0
mulq 3*8($in_ptr)
xor $acc1, $acc1
add $t0, $acc4
adc \$0, %rdx
add %rax, $acc4
mov $acc2, %rax mov $acc2, %rax
adc %rdx, $acc0 adc \$0, %rdx
adc \$0, $acc1
########################################## ##########################################
# Third iteration # Third iteration
mulq 1*8($in_ptr) mov $acc2, $t0
xor $t0, $t0 shl \$32, $acc2
mov %rdx, $acc1
mulq $t2
shr \$32, $t0
add $acc2, $acc3 add $acc2, $acc3
adc \$0, %rdx adc $t0, $acc0
add %rax, $acc3 adc %rax, $acc1
mov $acc2, %rax
adc %rdx, $acc4
adc \$0, $t0
mulq 3*8($in_ptr)
xor $acc2, $acc2
add $t0, $acc0
adc \$0, %rdx
add %rax, $acc0
mov $acc3, %rax mov $acc3, %rax
adc %rdx, $acc1 adc \$0, %rdx
adc \$0, $acc2
########################################### ###########################################
# Last iteration # Last iteration
mulq 1*8($in_ptr) mov $acc3, $t0
xor $t0, $t0 shl \$32, $acc3
add $acc3, $acc4 mov %rdx, $acc2
adc \$0, %rdx mulq $t2
add %rax, $acc4 shr \$32, $t0
mov $acc3, %rax add $acc3, $acc0
adc %rdx, $acc0 adc $t0, $acc1
adc \$0, $t0 mov $acc0, $t0
adc %rax, $acc2
mulq 3*8($in_ptr) mov $acc1, $in_ptr
add $t0, $acc1
adc \$0, %rdx adc \$0, %rdx
add %rax, $acc1
adc %rdx, $acc2
sbb $acc3, $acc3
mov 0*8($in_ptr), %rax ###########################################
mov 1*8($in_ptr), %rdx # Branch-less conditional subtraction
mov 2*8($in_ptr), $t0 sub \$-1, $acc0
mov 3*8($in_ptr), $t1 mov $acc2, %rax
sbb $t1, $acc1
and $acc3, %rax sbb \$0, $acc2
and $acc3, %rdx mov %rdx, $acc3
and $acc3, $t0 sbb $t2, %rdx
and $acc3, $t1 sbb $t2, $t2
sub %rax, $acc4 cmovnz $t0, $acc0
sbb %rdx, $acc0 cmovnz $in_ptr, $acc1
mov $acc4, 8*0($r_ptr) mov $acc0, 8*0($r_ptr)
sbb $t0, $acc1 cmovnz %rax, $acc2
mov $acc0, 8*1($r_ptr) mov $acc1, 8*1($r_ptr)
sbb $t1, $acc2 cmovz %rdx, $acc3
mov $acc1, 8*2($r_ptr) mov $acc2, 8*2($r_ptr)
mov $acc2, 8*3($r_ptr) mov $acc3, 8*3($r_ptr)
pop %r13 pop %r13
pop %r12 pop %r12
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册