crypto/bn/x86_64-mont5.pl: constant-time gather procedure.

At the same time remove miniscule bias in final subtraction. Performance penalty varies from platform to platform, and even with key length. For rsa2048 sign it was observed to be 4% for Sandy Bridge and 7% on Broadwell. CVE-2016-0702 Reviewed-by: N Richard Levitte <levitte@openssl.org> Reviewed-by: N Rich Salz <rsalz@openssl.org>

crypto/bn/x86_64-mont5.pl: constant-time gather procedure.
At the same time remove miniscule bias in final subtraction. Performance penalty varies from platform to platform, and even with key length. For rsa2048 sign it was observed to be 4% for Sandy Bridge and 7% on Broadwell. CVE-2016-0702 Reviewed-by: N Richard Levitte <levitte@openssl.org> Reviewed-by: N Rich Salz <rsalz@openssl.org>
8fc8f486 · Andy Polyakov · Matt Caswell · d6d422e1 · 8fc8f486 · 8fc8f486
Showing with 678 addition and 558 deletion

crypto/bn/asm/x86_64-mont.pl crypto/bn/asm/x86_64-mont.pl +6 -33

crypto/bn/asm/x86_64-mont5.pl crypto/bn/asm/x86_64-mont5.pl +658 -511

crypto/bn/bn_exp.c crypto/bn/bn_exp.c +14 -14

未找到文件。
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -775,20 +775,20 @@ bn_sqr8x_mont:
 	# 4096. this is done to allow memory disambiguation logic
 	# do its job.
 	#
-	lea	-64(%rsp,$num,4),%r11
+	lea	-64(%rsp,$num,2),%r11
 	mov	($n0),$n0		# *n0
 	sub	$aptr,%r11
 	and	\$4095,%r11
 	cmp	%r11,%r10
 	jb	.Lsqr8x_sp_alt
 	sub	%r11,%rsp		# align with $aptr
-	lea	-64(%rsp,$num,4),%rsp	# alloca(frame+4*$num)
+	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
 	jmp	.Lsqr8x_sp_done

 .align	32
 .Lsqr8x_sp_alt:
-	lea	4096-64(,$num,4),%r10	# 4096-frame-4*$num
-	lea	-64(%rsp,$num,4),%rsp	# alloca(frame+4*$num)
+	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
+	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
 	sub	%r10,%r11
 	mov	\$0,%r10
 	cmovc	%r10,%r11
@@ -798,37 +798,17 @@ bn_sqr8x_mont:
 	mov	$num,%r10	
 	neg	$num

-	lea	64(%rsp,$num,2),%r11	# copy of modulus
 	mov	$n0,  32(%rsp)
 	mov	%rax, 40(%rsp)		# save original %rsp
 .Lsqr8x_body:

-	mov	$num,$i
-	movq	%r11, %xmm2		# save pointer to modulus copy
-	shr	\$3+2,$i
-	mov	OPENSSL_ia32cap_P+8(%rip),%eax
-	jmp	.Lsqr8x_copy_n
-
-.align	32
-.Lsqr8x_copy_n:
-	movq	8*0($nptr),%xmm0
-	movq	8*1($nptr),%xmm1
-	movq	8*2($nptr),%xmm3
-	movq	8*3($nptr),%xmm4
-	lea	8*4($nptr),$nptr
-	movdqa	%xmm0,16*0(%r11)
-	movdqa	%xmm1,16*1(%r11)
-	movdqa	%xmm3,16*2(%r11)
-	movdqa	%xmm4,16*3(%r11)
-	lea	16*4(%r11),%r11
-	dec	$i
-	jnz	.Lsqr8x_copy_n
-
+	movq	$nptr, %xmm2		# save pointer to modulus
 	pxor	%xmm0,%xmm0
 	movq	$rptr,%xmm1		# save $rptr
 	movq	%r10, %xmm3		# -$num
 ___
 $code.=<<___ if ($addx);
+	mov	OPENSSL_ia32cap_P+8(%rip),%eax
 	and	\$0x80100,%eax
 	cmp	\$0x80100,%eax
 	jne	.Lsqr8x_nox
@@ -837,7 +817,6 @@ $code.=<<___ if ($addx);

 	pxor	%xmm0,%xmm0
 	lea	48(%rsp),%rax
-	lea	64(%rsp,$num,2),%rdx
 	shr	\$3+2,$num
 	mov	40(%rsp),%rsi		# restore %rsp
 	jmp	.Lsqr8x_zero
@@ -850,7 +829,6 @@ $code.=<<___;

 	pxor	%xmm0,%xmm0
 	lea	48(%rsp),%rax
-	lea	64(%rsp,$num,2),%rdx
 	shr	\$3+2,$num
 	mov	40(%rsp),%rsi		# restore %rsp
 	jmp	.Lsqr8x_zero
@@ -862,11 +840,6 @@ $code.=<<___;
 	movdqa	%xmm0,16*2(%rax)
 	movdqa	%xmm0,16*3(%rax)
 	lea	16*4(%rax),%rax
-	movdqa	%xmm0,16*0(%rdx)	# wipe n
-	movdqa	%xmm0,16*1(%rdx)
-	movdqa	%xmm0,16*2(%rdx)
-	movdqa	%xmm0,16*3(%rdx)
-	lea	16*4(%rdx),%rdx
 	dec	$num
 	jnz	.Lsqr8x_zero


--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
--- a/crypto/bn/bn_exp.c
+++ b/crypto/bn/bn_exp.c
@@ -787,8 +787,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
    if (window >= 5) {
        window = 5;             /* ~5% improvement for RSA2048 sign, and even
                                 * for RSA4096 */
-        if ((top & 7) == 0)
-            powerbufLen += 2 * top * sizeof(m->d[0]);
+        /* reserve space for mont->N.d[] copy */
+        powerbufLen += top * sizeof(mont->N.d[0]);
    }
 #endif
    (void)0;
@@ -1008,7 +1008,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
                               const BN_ULONG *not_used, const BN_ULONG *np,
                               const BN_ULONG *n0, int num);

-        BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2;
+        BN_ULONG *n0 = mont->n0, *np;

        /*
         * BN_to_montgomery can contaminate words above .top [in
@@ -1019,11 +1019,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
        for (i = tmp.top; i < top; i++)
            tmp.d[i] = 0;

-        if (top & 7)
-            np2 = np;
-        else
-            for (np2 = am.d + top, i = 0; i < top; i++)
-                np2[2 * i] = np[i];
+        /*
+         * copy mont->N.d[] to improve cache locality
+         */
+        for (np = am.d + top, i = 0; i < top; i++)
+            np[i] = mont->N.d[i];

        bn_scatter5(tmp.d, top, powerbuf, 0);
        bn_scatter5(am.d, am.top, powerbuf, 1);
@@ -1033,7 +1033,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 # if 0
        for (i = 3; i < 32; i++) {
            /* Calculate a^i = a^(i-1) * a */
-            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
            bn_scatter5(tmp.d, top, powerbuf, i);
        }
 # else
@@ -1044,7 +1044,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
        }
        for (i = 3; i < 8; i += 2) {
            int j;
-            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
            bn_scatter5(tmp.d, top, powerbuf, i);
            for (j = 2 * i; j < 32; j *= 2) {
                bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
@@ -1052,13 +1052,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
            }
        }
        for (; i < 16; i += 2) {
-            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
            bn_scatter5(tmp.d, top, powerbuf, i);
            bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
            bn_scatter5(tmp.d, top, powerbuf, 2 * i);
        }
        for (; i < 32; i += 2) {
-            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
            bn_scatter5(tmp.d, top, powerbuf, i);
        }
 # endif
@@ -1087,11 +1087,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
            while (bits >= 0) {
                wvalue = bn_get_bits5(p->d, bits - 4);
                bits -= 5;
-                bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
+                bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
            }
        }

-        ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top);
+        ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
        tmp.top = top;
        bn_correct_top(&tmp);
        if (ret) {