提交 9e557ab2 编写于 作者: Andy Polyakov

ecp_nistz256-x86_64.pl: fix occasional failures.

RT: 3607
Reviewed-by: Adam Langley <agl@google.com>
Reviewed-by: Emilia Kasper <emilia@openssl.org>
上级 2c60925d
...@@ -31,15 +31,16 @@ ...@@ -31,15 +31,16 @@
# Further optimization by <appro@openssl.org>: # Further optimization by <appro@openssl.org>:
# #
# this/original # this/original
# Opteron +8-33% # Opteron +12-49%
# Bulldozer +10-30% # Bulldozer +14-45%
# P4 +14-38% # P4 +18-46%
# Westmere +8-23% # Westmere +12-34%
# Sandy Bridge +8-24% # Sandy Bridge +9-35%
# Ivy Bridge +7-25% # Ivy Bridge +9-35%
# Haswell +5-25% # Haswell +8-37%
# Atom +10-32% # Broadwell +18-58%
# VIA Nano +37-130% # Atom +15-50%
# VIA Nano +43-160%
# #
# Ranges denote minimum and maximum improvement coefficients depending # Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, relatively # on benchmark. Lower coefficients are for ECDSA sign, relatively
...@@ -550,28 +551,20 @@ __ecp_nistz256_mul_montq: ...@@ -550,28 +551,20 @@ __ecp_nistz256_mul_montq:
# and add the result to the acc. # and add the result to the acc.
# Due to the special form of p256 we do some optimizations # Due to the special form of p256 we do some optimizations
# #
# acc[0] x p256[0] = acc[0] x 2^64 - acc[0] # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
# then we add acc[0] and get acc[0] x 2^64 # then we add acc[0] and get acc[0] x 2^96
mulq $poly1
xor $t0, $t0
add $acc0, $acc1 # +=acc[0]*2^64
adc \$0, %rdx
add %rax, $acc1
mov $acc0, %rax
# acc[0] x p256[2] = 0
adc %rdx, $acc2
adc \$0, $t0
mov $acc0, $t1
shl \$32, $acc0
mulq $poly3 mulq $poly3
xor $acc0, $acc0 shr \$32, $t1
add $t0, $acc3 add $acc0, $acc1 # +=acc[0]<<96
adc \$0, %rdx adc $t1, $acc2
add %rax, $acc3 adc %rax, $acc3
mov 8*1($b_ptr), %rax mov 8*1($b_ptr), %rax
adc %rdx, $acc4 adc %rdx, $acc4
adc \$0, $acc5 adc \$0, $acc5
xor $acc0, $acc0
######################################################################## ########################################################################
# Multiply by b[1] # Multiply by b[1]
...@@ -608,23 +601,17 @@ __ecp_nistz256_mul_montq: ...@@ -608,23 +601,17 @@ __ecp_nistz256_mul_montq:
######################################################################## ########################################################################
# Second reduction step # Second reduction step
mulq $poly1 mov $acc1, $t1
xor $t0, $t0 shl \$32, $acc1
add $acc1, $acc2
adc \$0, %rdx
add %rax, $acc2
mov $acc1, %rax
adc %rdx, $acc3
adc \$0, $t0
mulq $poly3 mulq $poly3
xor $acc1, $acc1 shr \$32, $t1
add $t0, $acc4 add $acc1, $acc2
adc \$0, %rdx adc $t1, $acc3
add %rax, $acc4 adc %rax, $acc4
mov 8*2($b_ptr), %rax mov 8*2($b_ptr), %rax
adc %rdx, $acc5 adc %rdx, $acc5
adc \$0, $acc0 adc \$0, $acc0
xor $acc1, $acc1
######################################################################## ########################################################################
# Multiply by b[2] # Multiply by b[2]
...@@ -661,23 +648,17 @@ __ecp_nistz256_mul_montq: ...@@ -661,23 +648,17 @@ __ecp_nistz256_mul_montq:
######################################################################## ########################################################################
# Third reduction step # Third reduction step
mulq $poly1 mov $acc2, $t1
xor $t0, $t0 shl \$32, $acc2
add $acc2, $acc3
adc \$0, %rdx
add %rax, $acc3
mov $acc2, %rax
adc %rdx, $acc4
adc \$0, $t0
mulq $poly3 mulq $poly3
xor $acc2, $acc2 shr \$32, $t1
add $t0, $acc5 add $acc2, $acc3
adc \$0, %rdx adc $t1, $acc4
add %rax, $acc5 adc %rax, $acc5
mov 8*3($b_ptr), %rax mov 8*3($b_ptr), %rax
adc %rdx, $acc0 adc %rdx, $acc0
adc \$0, $acc1 adc \$0, $acc1
xor $acc2, $acc2
######################################################################## ########################################################################
# Multiply by b[3] # Multiply by b[3]
...@@ -714,20 +695,14 @@ __ecp_nistz256_mul_montq: ...@@ -714,20 +695,14 @@ __ecp_nistz256_mul_montq:
######################################################################## ########################################################################
# Final reduction step # Final reduction step
mulq $poly1 mov $acc3, $t1
#xor $t0, $t0 shl \$32, $acc3
add $acc3, $acc4
adc \$0, %rdx
add %rax, $acc4
mov $acc3, %rax
adc %rdx, $acc5
#adc \$0, $t0 # doesn't overflow
mulq $poly3 mulq $poly3
#add $t0, $acc0 shr \$32, $t1
#adc \$0, %rdx add $acc3, $acc4
adc $t1, $acc5
mov $acc4, $t0 mov $acc4, $t0
add %rax, $acc0 adc %rax, $acc0
adc %rdx, $acc1 adc %rdx, $acc1
mov $acc5, $t1 mov $acc5, $t1
adc \$0, $acc2 adc \$0, $acc2
...@@ -740,14 +715,14 @@ __ecp_nistz256_mul_montq: ...@@ -740,14 +715,14 @@ __ecp_nistz256_mul_montq:
sbb \$0, $acc0 # .Lpoly[2] sbb \$0, $acc0 # .Lpoly[2]
mov $acc1, $t3 mov $acc1, $t3
sbb $poly3, $acc1 # .Lpoly[3] sbb $poly3, $acc1 # .Lpoly[3]
neg $acc2 sbb \$0, $acc2
cmovnc $t0, $acc4 cmovc $t0, $acc4
cmovnc $t1, $acc5 cmovc $t1, $acc5
mov $acc4, 8*0($r_ptr) mov $acc4, 8*0($r_ptr)
cmovnc $t2, $acc0 cmovc $t2, $acc0
mov $acc5, 8*1($r_ptr) mov $acc5, 8*1($r_ptr)
cmovnc $t3, $acc1 cmovc $t3, $acc1
mov $acc0, 8*2($r_ptr) mov $acc0, 8*2($r_ptr)
mov $acc1, 8*3($r_ptr) mov $acc1, 8*3($r_ptr)
...@@ -897,89 +872,62 @@ __ecp_nistz256_sqr_montq: ...@@ -897,89 +872,62 @@ __ecp_nistz256_sqr_montq:
########################################## ##########################################
# Now the reduction # Now the reduction
# First iteration # First iteration
mulq $a_ptr mov $acc0, $t0
#xor $t0, $t0 shl \$32, $acc0
add $acc0, $acc1
adc \$0, %rdx
add %rax, $acc1
mov $acc0, %rax
adc %rdx, $acc2 # doesn't overflow
#adc \$0, $t0
mulq $t1 mulq $t1
xor $acc0, $acc0 shr \$32, $t0
#add $t0, $acc3 add $acc0, $acc1 # +=acc[0]<<96
#adc \$0, %rdx adc $t0, $acc2
add %rax, $acc3 adc %rax, $acc3
mov $acc1, %rax mov $acc1, %rax
adc %rdx, $acc4 adc \$0, %rdx
adc \$0, $acc0
########################################## ##########################################
# Second iteration # Second iteration
mulq $a_ptr mov $acc1, $t0
#xor $t0, $t0 shl \$32, $acc1
add $acc1, $acc2 mov %rdx, $acc0
adc \$0, %rdx
add %rax, $acc2
mov $acc1, %rax
adc %rdx, $acc3 # doesn't overflow
#adc \$0, $t0
mulq $t1 mulq $t1
xor $acc1, $acc1 shr \$32, $t0
#add $t0, $acc4 add $acc1, $acc2
#adc \$0, %rdx adc $t0, $acc3
add %rax, $acc4 adc %rax, $acc0
mov $acc2, %rax mov $acc2, %rax
adc %rdx, $acc0 adc \$0, %rdx
adc \$0, $acc1
########################################## ##########################################
# Third iteration # Third iteration
mulq $a_ptr mov $acc2, $t0
#xor $t0, $t0 shl \$32, $acc2
add $acc2, $acc3 mov %rdx, $acc1
adc \$0, %rdx
add %rax, $acc3
mov $acc2, %rax
adc %rdx, $acc4 # doesn't overflow
#adc \$0, $t0
mulq $t1 mulq $t1
xor $acc2, $acc2 shr \$32, $t0
#add $t0, $acc0 add $acc2, $acc3
#adc \$0, %rdx adc $t0, $acc0
add %rax, $acc0 adc %rax, $acc1
mov $acc3, %rax mov $acc3, %rax
adc %rdx, $acc1 adc \$0, %rdx
adc \$0, $acc2
########################################### ###########################################
# Last iteration # Last iteration
mulq $a_ptr mov $acc3, $t0
#xor $t0, $t0 shl \$32, $acc3
add $acc3, $acc4 mov %rdx, $acc2
adc \$0, %rdx
add %rax, $acc4
mov $acc3, %rax
adc %rdx, $acc0 # doesn't overflow
#adc \$0, $t0
mulq $t1 mulq $t1
shr \$32, $t0
add $acc3, $acc0
adc $t0, $acc1
adc %rax, $acc2
adc \$0, %rdx
xor $acc3, $acc3 xor $acc3, $acc3
#add $t0, $acc1
#adc \$0, %rdx
add %rax, $acc1
adc %rdx, $acc2
adc \$0, $acc3
############################################ ############################################
# Add the rest of the acc # Add the rest of the acc
add $acc0, $acc5 add $acc0, $acc4
adc $acc1, $acc5
mov $acc4, $acc0 mov $acc4, $acc0
adc $acc1, $acc6 adc $acc2, $acc6
adc $acc2, $acc7 adc %rdx, $acc7
mov $acc5, $acc1 mov $acc5, $acc1
adc \$0, $acc3 adc \$0, $acc3
...@@ -989,14 +937,14 @@ __ecp_nistz256_sqr_montq: ...@@ -989,14 +937,14 @@ __ecp_nistz256_sqr_montq:
sbb \$0, $acc6 # .Lpoly[2] sbb \$0, $acc6 # .Lpoly[2]
mov $acc7, $t0 mov $acc7, $t0
sbb $t1, $acc7 # .Lpoly[3] sbb $t1, $acc7 # .Lpoly[3]
neg $acc3 sbb \$0, $acc3
cmovnc $acc0, $acc4 cmovc $acc0, $acc4
cmovnc $acc1, $acc5 cmovc $acc1, $acc5
mov $acc4, 8*0($r_ptr) mov $acc4, 8*0($r_ptr)
cmovnc $acc2, $acc6 cmovc $acc2, $acc6
mov $acc5, 8*1($r_ptr) mov $acc5, 8*1($r_ptr)
cmovnc $t0, $acc7 cmovc $t0, $acc7
mov $acc6, 8*2($r_ptr) mov $acc6, 8*2($r_ptr)
mov $acc7, 8*3($r_ptr) mov $acc7, 8*3($r_ptr)
...@@ -1028,18 +976,15 @@ __ecp_nistz256_mul_montx: ...@@ -1028,18 +976,15 @@ __ecp_nistz256_mul_montx:
######################################################################## ########################################################################
# First reduction step # First reduction step
xor $acc0, $acc0 # $acc0=0,cf=0,of=0 add $t1, $acc1
adox $t1, $acc1 adc $t0, $acc2
adox $t0, $acc2
mulx $poly3, $t0, $t1 mulx $poly3, $t0, $t1
mov 8*1($b_ptr), %rdx mov 8*1($b_ptr), %rdx
adox $t0, $acc3 adc $t0, $acc3
adcx $t1, $acc4 adc $t1, $acc4
adc \$0, $acc5
adox $acc0, $acc4 xor $acc0, $acc0 # $acc0=0,cf=0,of=0
adcx $acc0, $acc5 # cf=0
adox $acc0, $acc5 # of=0
######################################################################## ########################################################################
# Multiply by b[1] # Multiply by b[1]
...@@ -1068,18 +1013,15 @@ __ecp_nistz256_mul_montx: ...@@ -1068,18 +1013,15 @@ __ecp_nistz256_mul_montx:
######################################################################## ########################################################################
# Second reduction step # Second reduction step
xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 add $t0, $acc2
adox $t0, $acc2 adc $t1, $acc3
adox $t1, $acc3
mulx $poly3, $t0, $t1 mulx $poly3, $t0, $t1
mov 8*2($b_ptr), %rdx mov 8*2($b_ptr), %rdx
adox $t0, $acc4 adc $t0, $acc4
adcx $t1, $acc5 adc $t1, $acc5
adc \$0, $acc0
adox $acc1, $acc5 xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
adcx $acc1, $acc0 # cf=0
adox $acc1, $acc0 # of=0
######################################################################## ########################################################################
# Multiply by b[2] # Multiply by b[2]
...@@ -1108,18 +1050,15 @@ __ecp_nistz256_mul_montx: ...@@ -1108,18 +1050,15 @@ __ecp_nistz256_mul_montx:
######################################################################## ########################################################################
# Third reduction step # Third reduction step
xor $acc2, $acc2 # $acc2=0,cf=0,of=0 add $t0, $acc3
adox $t0, $acc3 adc $t1, $acc4
adox $t1, $acc4
mulx $poly3, $t0, $t1 mulx $poly3, $t0, $t1
mov 8*3($b_ptr), %rdx mov 8*3($b_ptr), %rdx
adox $t0, $acc5 adc $t0, $acc5
adcx $t1, $acc0 adc $t1, $acc0
adc \$0, $acc1
adox $acc2, $acc0 xor $acc2, $acc2 # $acc2=0,cf=0,of=0
adcx $acc2, $acc1 # cf=0
adox $acc2, $acc1 # of=0
######################################################################## ########################################################################
# Multiply by b[3] # Multiply by b[3]
...@@ -1148,38 +1087,34 @@ __ecp_nistz256_mul_montx: ...@@ -1148,38 +1087,34 @@ __ecp_nistz256_mul_montx:
######################################################################## ########################################################################
# Fourth reduction step # Fourth reduction step
xor $acc3, $acc3 # $acc3=0,cf=0,of=0 add $t0, $acc4
adox $t0, $acc4 adc $t1, $acc5
adox $t1, $acc5
mulx $poly3, $t0, $t1 mulx $poly3, $t0, $t1
mov $acc4, $t2 mov $acc4, $t2
mov .Lpoly+8*1(%rip), $poly1 mov .Lpoly+8*1(%rip), $poly1
adcx $t0, $acc0 adc $t0, $acc0
adox $t1, $acc1
mov $acc5, $t3 mov $acc5, $t3
adc $t1, $acc1
adcx $acc3, $acc1
adox $acc3, $acc2
adc \$0, $acc2 adc \$0, $acc2
mov $acc0, $t0
######################################################################## ########################################################################
# Branch-less conditional subtraction of P # Branch-less conditional subtraction of P
xor %eax, %eax xor %eax, %eax
mov $acc0, $t0
sbb \$-1, $acc4 # .Lpoly[0] sbb \$-1, $acc4 # .Lpoly[0]
sbb $poly1, $acc5 # .Lpoly[1] sbb $poly1, $acc5 # .Lpoly[1]
sbb \$0, $acc0 # .Lpoly[2] sbb \$0, $acc0 # .Lpoly[2]
mov $acc1, $t1 mov $acc1, $t1
sbb $poly3, $acc1 # .Lpoly[3] sbb $poly3, $acc1 # .Lpoly[3]
sbb \$0, $acc2
bt \$0,$acc2 cmovc $t2, $acc4
cmovnc $t2, $acc4 cmovc $t3, $acc5
cmovnc $t3, $acc5
mov $acc4, 8*0($r_ptr) mov $acc4, 8*0($r_ptr)
cmovnc $t0, $acc0 cmovc $t0, $acc0
mov $acc5, 8*1($r_ptr) mov $acc5, 8*1($r_ptr)
cmovnc $t1, $acc1 cmovc $t1, $acc1
mov $acc0, 8*2($r_ptr) mov $acc0, 8*2($r_ptr)
mov $acc1, 8*3($r_ptr) mov $acc1, 8*3($r_ptr)
...@@ -1247,52 +1182,44 @@ __ecp_nistz256_sqr_montx: ...@@ -1247,52 +1182,44 @@ __ecp_nistz256_sqr_montx:
mov .Lpoly+8*3(%rip), $t1 mov .Lpoly+8*3(%rip), $t1
# reduction step 1 # reduction step 1
xor $acc0, $acc0 add $t0, $acc1
adcx $t0, $acc1 adc $t4, $acc2
adcx $t4, $acc2
mulx $t1, $t0, $t4 mulx $t1, $t0, $acc0
mov $acc1, %rdx mov $acc1, %rdx
adcx $t0, $acc3 adc $t0, $acc3
shlx $a_ptr, $acc1, $t0 shlx $a_ptr, $acc1, $t0
adox $t4, $acc0
shrx $a_ptr, $acc1, $t4
adc \$0, $acc0 adc \$0, $acc0
shrx $a_ptr, $acc1, $t4
# reduction step 2 # reduction step 2
xor $acc1, $acc1 add $t0, $acc2
adcx $t0, $acc2 adc $t4, $acc3
adcx $t4, $acc3
mulx $t1, $t0, $t4 mulx $t1, $t0, $acc1
mov $acc2, %rdx mov $acc2, %rdx
adcx $t0, $acc0 adc $t0, $acc0
shlx $a_ptr, $acc2, $t0 shlx $a_ptr, $acc2, $t0
adox $t4, $acc1
shrx $a_ptr, $acc2, $t4
adc \$0, $acc1 adc \$0, $acc1
shrx $a_ptr, $acc2, $t4
# reduction step 3 # reduction step 3
xor $acc2, $acc2 add $t0, $acc3
adcx $t0, $acc3 adc $t4, $acc0
adcx $t4, $acc0
mulx $t1, $t0, $t4 mulx $t1, $t0, $acc2
mov $acc3, %rdx mov $acc3, %rdx
adcx $t0, $acc1 adc $t0, $acc1
shlx $a_ptr, $acc3, $t0 shlx $a_ptr, $acc3, $t0
adox $t4, $acc2
shrx $a_ptr, $acc3, $t4
adc \$0, $acc2 adc \$0, $acc2
shrx $a_ptr, $acc3, $t4
# reduction step 4 # reduction step 4
xor $acc3, $acc3 add $t0, $acc0
adcx $t0, $acc0 adc $t4, $acc1
adcx $t4, $acc1
mulx $t1, $t0, $t4 mulx $t1, $t0, $acc3
adcx $t0, $acc2 adc $t0, $acc2
adox $t4, $acc3
adc \$0, $acc3 adc \$0, $acc3
xor $t3, $t3 # cf=0 xor $t3, $t3 # cf=0
...@@ -1312,14 +1239,14 @@ __ecp_nistz256_sqr_montx: ...@@ -1312,14 +1239,14 @@ __ecp_nistz256_sqr_montx:
sbb \$0, $acc6 # .Lpoly[2] sbb \$0, $acc6 # .Lpoly[2]
mov $acc7, $acc3 mov $acc7, $acc3
sbb $t1, $acc7 # .Lpoly[3] sbb $t1, $acc7 # .Lpoly[3]
sbb \$0, $t3
bt \$0,$t3 cmovc $acc0, $acc4
cmovnc $acc0, $acc4 cmovc $acc1, $acc5
cmovnc $acc1, $acc5
mov $acc4, 8*0($r_ptr) mov $acc4, 8*0($r_ptr)
cmovnc $acc2, $acc6 cmovc $acc2, $acc6
mov $acc5, 8*1($r_ptr) mov $acc5, 8*1($r_ptr)
cmovnc $acc3, $acc7 cmovc $acc3, $acc7
mov $acc6, 8*2($r_ptr) mov $acc6, 8*2($r_ptr)
mov $acc7, 8*3($r_ptr) mov $acc7, 8*3($r_ptr)
...@@ -1330,8 +1257,8 @@ ___ ...@@ -1330,8 +1257,8 @@ ___
} }
{ {
my ($r_ptr,$in_ptr)=("%rdi","%rsi"); my ($r_ptr,$in_ptr)=("%rdi","%rsi");
my ($acc0,$acc1,$acc2,$acc3,$acc4)=map("%r$_",(8..12)); my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
my ($t0,$t1)=("%rcx","%rsi"); my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
$code.=<<___; $code.=<<___;
################################################################################ ################################################################################
...@@ -1348,109 +1275,83 @@ ecp_nistz256_from_mont: ...@@ -1348,109 +1275,83 @@ ecp_nistz256_from_mont:
push %r13 push %r13
mov 8*0($in_ptr), %rax mov 8*0($in_ptr), %rax
mov .Lpoly+8*3(%rip), $t2
mov 8*1($in_ptr), $acc1 mov 8*1($in_ptr), $acc1
mov 8*2($in_ptr), $acc2 mov 8*2($in_ptr), $acc2
mov 8*3($in_ptr), $acc3 mov 8*3($in_ptr), $acc3
lea .Lpoly(%rip), $in_ptr
xor $acc4, $acc4
mov %rax, $acc0 mov %rax, $acc0
mov .Lpoly+8*1(%rip), $t1
######################################### #########################################
# First iteration # First iteration
mulq 1*8($in_ptr) mov %rax, $t0
xor $t0, $t0 shl \$32, $acc0
mulq $t2
shr \$32, $t0
add $acc0, $acc1 add $acc0, $acc1
adc \$0, %rdx adc $t0, $acc2
add %rax, $acc1 adc %rax, $acc3
mov $acc0, %rax
adc %rdx, $acc2
adc \$0, $t0
mulq 3*8($in_ptr)
xor $acc0, $acc0
add $t0, $acc3
adc \$0, %rdx
add %rax, $acc3
mov $acc1, %rax mov $acc1, %rax
adc %rdx, $acc4 adc \$0, %rdx
adc \$0, $acc0
######################################### #########################################
# Second iteration # Second iteration
mulq 1*8($in_ptr) mov $acc1, $t0
xor $t0, $t0 shl \$32, $acc1
mov %rdx, $acc0
mulq $t2
shr \$32, $t0
add $acc1, $acc2 add $acc1, $acc2
adc \$0, %rdx adc $t0, $acc3
add %rax, $acc2 adc %rax, $acc0
mov $acc1, %rax
adc %rdx, $acc3
adc \$0, $t0
mulq 3*8($in_ptr)
xor $acc1, $acc1
add $t0, $acc4
adc \$0, %rdx
add %rax, $acc4
mov $acc2, %rax mov $acc2, %rax
adc %rdx, $acc0 adc \$0, %rdx
adc \$0, $acc1
########################################## ##########################################
# Third iteration # Third iteration
mulq 1*8($in_ptr) mov $acc2, $t0
xor $t0, $t0 shl \$32, $acc2
mov %rdx, $acc1
mulq $t2
shr \$32, $t0
add $acc2, $acc3 add $acc2, $acc3
adc \$0, %rdx adc $t0, $acc0
add %rax, $acc3 adc %rax, $acc1
mov $acc2, %rax
adc %rdx, $acc4
adc \$0, $t0
mulq 3*8($in_ptr)
xor $acc2, $acc2
add $t0, $acc0
adc \$0, %rdx
add %rax, $acc0
mov $acc3, %rax mov $acc3, %rax
adc %rdx, $acc1 adc \$0, %rdx
adc \$0, $acc2
########################################### ###########################################
# Last iteration # Last iteration
mulq 1*8($in_ptr) mov $acc3, $t0
xor $t0, $t0 shl \$32, $acc3
add $acc3, $acc4 mov %rdx, $acc2
adc \$0, %rdx mulq $t2
add %rax, $acc4 shr \$32, $t0
mov $acc3, %rax add $acc3, $acc0
adc %rdx, $acc0 adc $t0, $acc1
adc \$0, $t0 mov $acc0, $t0
adc %rax, $acc2
mulq 3*8($in_ptr) mov $acc1, $in_ptr
add $t0, $acc1
adc \$0, %rdx adc \$0, %rdx
add %rax, $acc1
adc %rdx, $acc2
sbb $acc3, $acc3
mov 0*8($in_ptr), %rax
mov 1*8($in_ptr), %rdx
mov 2*8($in_ptr), $t0
mov 3*8($in_ptr), $t1
and $acc3, %rax ###########################################
and $acc3, %rdx # Branch-less conditional subtraction
and $acc3, $t0 sub \$-1, $acc0
and $acc3, $t1 mov $acc2, %rax
sbb $t1, $acc1
sub %rax, $acc4 sbb \$0, $acc2
sbb %rdx, $acc0 mov %rdx, $acc3
mov $acc4, 8*0($r_ptr) sbb $t2, %rdx
sbb $t0, $acc1 sbb $t2, $t2
mov $acc0, 8*1($r_ptr)
sbb $t1, $acc2 cmovnz $t0, $acc0
mov $acc1, 8*2($r_ptr) cmovnz $in_ptr, $acc1
mov $acc2, 8*3($r_ptr) mov $acc0, 8*0($r_ptr)
cmovnz %rax, $acc2
mov $acc1, 8*1($r_ptr)
cmovz %rdx, $acc3
mov $acc2, 8*2($r_ptr)
mov $acc3, 8*3($r_ptr)
pop %r13 pop %r13
pop %r12 pop %r12
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册