提交 8fc8f486 编写于 作者: A Andy Polyakov 提交者: Matt Caswell

crypto/bn/x86_64-mont5.pl: constant-time gather procedure.

At the same time remove miniscule bias in final subtraction.
Performance penalty varies from platform to platform, and even with
key length. For rsa2048 sign it was observed to be 4% for Sandy
Bridge and 7% on Broadwell.

CVE-2016-0702
Reviewed-by: NRichard Levitte <levitte@openssl.org>
Reviewed-by: NRich Salz <rsalz@openssl.org>
上级 d6d422e1
......@@ -775,20 +775,20 @@ bn_sqr8x_mont:
# 4096. this is done to allow memory disambiguation logic
# do its job.
#
lea -64(%rsp,$num,4),%r11
lea -64(%rsp,$num,2),%r11
mov ($n0),$n0 # *n0
sub $aptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lsqr8x_sp_alt
sub %r11,%rsp # align with $aptr
lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
jmp .Lsqr8x_sp_done
.align 32
.Lsqr8x_sp_alt:
lea 4096-64(,$num,4),%r10 # 4096-frame-4*$num
lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
......@@ -798,37 +798,17 @@ bn_sqr8x_mont:
mov $num,%r10
neg $num
lea 64(%rsp,$num,2),%r11 # copy of modulus
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
.Lsqr8x_body:
mov $num,$i
movq %r11, %xmm2 # save pointer to modulus copy
shr \$3+2,$i
mov OPENSSL_ia32cap_P+8(%rip),%eax
jmp .Lsqr8x_copy_n
.align 32
.Lsqr8x_copy_n:
movq 8*0($nptr),%xmm0
movq 8*1($nptr),%xmm1
movq 8*2($nptr),%xmm3
movq 8*3($nptr),%xmm4
lea 8*4($nptr),$nptr
movdqa %xmm0,16*0(%r11)
movdqa %xmm1,16*1(%r11)
movdqa %xmm3,16*2(%r11)
movdqa %xmm4,16*3(%r11)
lea 16*4(%r11),%r11
dec $i
jnz .Lsqr8x_copy_n
movq $nptr, %xmm2 # save pointer to modulus
pxor %xmm0,%xmm0
movq $rptr,%xmm1 # save $rptr
movq %r10, %xmm3 # -$num
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%eax
and \$0x80100,%eax
cmp \$0x80100,%eax
jne .Lsqr8x_nox
......@@ -837,7 +817,6 @@ $code.=<<___ if ($addx);
pxor %xmm0,%xmm0
lea 48(%rsp),%rax
lea 64(%rsp,$num,2),%rdx
shr \$3+2,$num
mov 40(%rsp),%rsi # restore %rsp
jmp .Lsqr8x_zero
......@@ -850,7 +829,6 @@ $code.=<<___;
pxor %xmm0,%xmm0
lea 48(%rsp),%rax
lea 64(%rsp,$num,2),%rdx
shr \$3+2,$num
mov 40(%rsp),%rsi # restore %rsp
jmp .Lsqr8x_zero
......@@ -862,11 +840,6 @@ $code.=<<___;
movdqa %xmm0,16*2(%rax)
movdqa %xmm0,16*3(%rax)
lea 16*4(%rax),%rax
movdqa %xmm0,16*0(%rdx) # wipe n
movdqa %xmm0,16*1(%rdx)
movdqa %xmm0,16*2(%rdx)
movdqa %xmm0,16*3(%rdx)
lea 16*4(%rdx),%rdx
dec $num
jnz .Lsqr8x_zero
......
此差异已折叠。
......@@ -787,8 +787,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
if (window >= 5) {
window = 5; /* ~5% improvement for RSA2048 sign, and even
* for RSA4096 */
if ((top & 7) == 0)
powerbufLen += 2 * top * sizeof(m->d[0]);
/* reserve space for mont->N.d[] copy */
powerbufLen += top * sizeof(mont->N.d[0]);
}
#endif
(void)0;
......@@ -1008,7 +1008,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
const BN_ULONG *not_used, const BN_ULONG *np,
const BN_ULONG *n0, int num);
BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2;
BN_ULONG *n0 = mont->n0, *np;
/*
* BN_to_montgomery can contaminate words above .top [in
......@@ -1019,11 +1019,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
for (i = tmp.top; i < top; i++)
tmp.d[i] = 0;
if (top & 7)
np2 = np;
else
for (np2 = am.d + top, i = 0; i < top; i++)
np2[2 * i] = np[i];
/*
* copy mont->N.d[] to improve cache locality
*/
for (np = am.d + top, i = 0; i < top; i++)
np[i] = mont->N.d[i];
bn_scatter5(tmp.d, top, powerbuf, 0);
bn_scatter5(am.d, am.top, powerbuf, 1);
......@@ -1033,7 +1033,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
# if 0
for (i = 3; i < 32; i++) {
/* Calculate a^i = a^(i-1) * a */
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
}
# else
......@@ -1044,7 +1044,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
for (i = 3; i < 8; i += 2) {
int j;
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
for (j = 2 * i; j < 32; j *= 2) {
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
......@@ -1052,13 +1052,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
}
for (; i < 16; i += 2) {
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
bn_scatter5(tmp.d, top, powerbuf, 2 * i);
}
for (; i < 32; i += 2) {
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
bn_scatter5(tmp.d, top, powerbuf, i);
}
# endif
......@@ -1087,11 +1087,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
while (bits >= 0) {
wvalue = bn_get_bits5(p->d, bits - 4);
bits -= 5;
bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
}
}
ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top);
ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
tmp.top = top;
bn_correct_top(&tmp);
if (ret) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册