提交 e7382805 编写于 作者: A Andy Polyakov

Add reference implementations of bn_[mul|sqr]_mont, new candidates for
assembler implementation.
上级 8265328d
...@@ -727,6 +727,8 @@ int RAND_pseudo_bytes(unsigned char *buf,int num); ...@@ -727,6 +727,8 @@ int RAND_pseudo_bytes(unsigned char *buf,int num);
bn_pollute(a); \ bn_pollute(a); \
} }
/* Montgomery multiply/square on raw word arrays; rp, ap, bp and np are each
 * indexed over num words. Definitions are provided under OPENSSL_BN_ASM_MONT. */
void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num);
void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np,BN_ULONG n0, int num);
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w); BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w); BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num); void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
......
...@@ -820,18 +820,95 @@ void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) ...@@ -820,18 +820,95 @@ void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
r[6]=c1; r[6]=c1;
r[7]=c2; r[7]=c2;
} }
#ifdef OPENSSL_BN_ASM_MONT
/*
 * This is essentially a reference implementation, which may or may not
 * result in a performance improvement. E.g. on IA-32 this does give 40%
 * faster rsa1024 private key operations and 10% faster rsa4096 ones,
 * while on AMD64 it improves rsa1024 sign only by 10%, but *worsens*
 * rsa4096 sign by 15%. Once again, it's a reference implementation,
 * one to be used as a starting point for platform-specific assembler.
 */
/*
 * bn_mul_mont - word-serial Montgomery multiplication, reference C version.
 *
 * rp  - result; num words are written
 * ap  - first multiplicand, read as num words
 * bp  - second multiplicand, read one word (bp[i]) per outer iteration
 * np  - modulus, read as num words
 * n0  - constant multiplied with tp[0] to obtain the reduction multiplier
 *       (NOTE(review): presumably -np^-1 modulo the word base; confirm
 *       against the code that fills in mont->n0)
 * num - word count of the operands; tp[num-1] and np[num-1] are read, so
 *       num is expected to be at least 1
 *
 * A scratch buffer of num+2 words is taken with alloca() and zeroed through
 * a volatile pointer before every return.
 */
void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num)
{
BN_ULONG c0,c1,ml,*tp;
#ifdef mul64
BN_ULONG mh;
#endif
volatile BN_ULONG *vp;
/* i must be 0 here: the goto below bypasses the for-loop initializer. */
int i=0,j;
/* vp aliases tp; it is used only for the final wipe (presumably volatile
 * so that the compiler cannot discard those stores - TODO confirm). */
vp = tp = alloca((num+2)*sizeof(BN_ULONG));
/* Iteration 0: tp = ap*bp[0] directly instead of zeroing tp and adding;
 * the value returned by bn_mul_words is kept as the top word. */
tp[num] = bn_mul_words(tp,ap,num,bp[0]);
tp[num+1] = 0;
/* Skip the accumulate half of the loop body for the first iteration. */
goto enter;
for(i=0;i<num;i++)
{
/* Accumulate: tp += ap*bp[i]; the returned word is folded into tp[num]
 * and the overflow of that addition is recorded in tp[num+1]. */
c0 = bn_mul_add_words(tp,ap,num,bp[i]);
c1 = (tp[num] + c0)&BN_MASK2;
tp[num] = c1;
tp[num+1] = (c1<c0?1:0);
/* Reduce: add ml*np to tp while storing every word one position lower. */
enter:
c1 = tp[0];
ml = (c1*n0)&BN_MASK2;
c0 = 0;
/* j==0 term: the low word left in c1 is never stored, only c0 is carried
 * forward (mul_add is assumed to leave the high word in its last argument). */
#ifdef mul64
mh = HBITS(ml);
ml = LBITS(ml);
mul_add(c1,np[0],ml,mh,c0);
#else
mul_add(c1,ml,np[0],c0);
#endif
for(j=1;j<num;j++)
{
c1 = tp[j];
#ifdef mul64
mul_add(c1,np[j],ml,mh,c0);
#else
mul_add(c1,ml,np[j],c0);
#endif
/* Store at j-1: this is the one-word right shift. */
tp[j-1] = c1&BN_MASK2;
}
/* Fold the last carry into the two top words, also shifted down by one. */
c1 = (tp[num] + c0)&BN_MASK2;
tp[num-1] = c1;
tp[num] = tp[num+1] + (c1<c0?1:0);
}
/* Cheap pre-check on the top words: only try the subtraction when tp may
 * be greater than or equal to np. */
if (tp[num]!=0 || tp[num-1]>=np[num-1])
{
/* rp = tp - np over num words; c0 receives bn_sub_words' return value
 * (presumably the borrow). */
c0 = bn_sub_words(rp,tp,np,num);
/* Extra top word set, or c0 is zero: keep the difference already in rp. */
if (tp[num]!=0 || c0==0)
{
for(i=0;i<num+2;i++) vp[i] = 0;
return;
}
}
/* Otherwise tp itself is the result: copy it out, wiping the scratch
 * buffer word by word. */
for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0;
vp[num] = 0;
vp[num+1] = 0;
}
/*
 * bn_sqr_mont - Montgomery squaring.
 * There is no dedicated squaring path here: it calls bn_mul_mont with ap
 * passed as both multiplicands.
 */
void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np,BN_ULONG n0, int num)
{
bn_mul_mont(rp,ap,ap,np,n0,num);
}
#endif /* OPENSSL_BN_ASM_MONT */
#else /* !BN_MUL_COMBA */ #else /* !BN_MUL_COMBA */
/* hmm... is it faster just to do a multiply? */ /* hmm... is it faster just to do a multiply? */
#undef bn_sqr_comba4 #undef bn_sqr_comba4
void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{ {
BN_ULONG t[8]; BN_ULONG t[8];
bn_sqr_normal(r,a,4,t); bn_sqr_normal(r,a,4,t);
} }
#undef bn_sqr_comba8 #undef bn_sqr_comba8
void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{ {
BN_ULONG t[16]; BN_ULONG t[16];
bn_sqr_normal(r,a,8,t); bn_sqr_normal(r,a,8,t);
...@@ -857,4 +934,49 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) ...@@ -857,4 +934,49 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]); r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
} }
#ifdef OPENSSL_BN_ASM_MONT
/*
 * bn_mul_mont - word-serial Montgomery multiplication for builds without
 * BN_MUL_COMBA, expressed purely in terms of bn_mul_add_words and
 * bn_sub_words.
 *
 * rp  - result; num words are written
 * ap  - first multiplicand (num words)
 * bp  - second multiplicand, consumed one word per outer iteration
 * np  - modulus (num words)
 * n0  - constant multiplied with the accumulator's low word to obtain the
 *       reduction multiplier
 * num - operand length in words
 *
 * The num+2 word accumulator comes from alloca() and is zeroed through a
 * volatile pointer before returning.
 */
void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num)
{
    BN_ULONG *acc;
    volatile BN_ULONG *wipe;
    BN_ULONG carry, top;
    int word, k;
    int result_in_rp = 0;

    acc = alloca((num+2)*sizeof(BN_ULONG));
    wipe = acc;

    /* Words 0..num start at zero; acc[num+1] is assigned inside the loop. */
    for (k = 0; k <= num; k++)
    {
        acc[k] = 0;
    }

    for (word = 0; word < num; word++)
    {
        /* acc += ap * bp[word]; fold the returned word into the top. */
        carry = bn_mul_add_words(acc, ap, num, bp[word]);
        top = acc[num] + carry;
        acc[num] = top;
        acc[num+1] = (top < carry) ? 1 : 0;

        /* acc += np * (acc[0] * n0); fold the returned word into the top. */
        carry = bn_mul_add_words(acc, np, num, acc[0]*n0);
        top = acc[num] + carry;
        acc[num] = top;
        if (top < carry)
        {
            acc[num+1] += 1;
        }

        /* Drop the low word: shift the accumulator down by one position. */
        for (k = 0; k <= num; k++)
        {
            acc[k] = acc[k+1];
        }
    }

    /* Try rp = acc - np only when the top words say acc may reach np. */
    if (acc[num] != 0 || acc[num-1] >= np[num-1])
    {
        carry = bn_sub_words(rp, acc, np, num);
        if (acc[num] != 0 || carry == 0)
        {
            result_in_rp = 1;
        }
    }

    /* The difference was not usable: the accumulator itself is the result. */
    if (!result_in_rp)
    {
        for (k = 0; k < num; k++)
        {
            rp[k] = acc[k];
        }
    }

    /* Scrub the whole scratch buffer via the volatile alias. */
    for (k = 0; k < num+2; k++)
    {
        wipe[k] = 0;
    }
}
/*
 * bn_sqr_mont - Montgomery squaring.
 * Implemented as a plain call to bn_mul_mont with ap supplied for both
 * multiplicands.
 */
void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np,BN_ULONG n0, int num)
{
bn_mul_mont(rp,ap,ap,np,n0,num);
}
#endif /* OPENSSL_BN_ASM_MONT */
#endif /* !BN_MUL_COMBA */ #endif /* !BN_MUL_COMBA */
...@@ -74,6 +74,22 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, ...@@ -74,6 +74,22 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
{ {
BIGNUM *tmp; BIGNUM *tmp;
int ret=0; int ret=0;
#ifdef OPENSSL_BN_ASM_MONT
int num = mont->N.top;
if (num>1 && a->top==num && b->top==num)
{
if (bn_wexpand(r,num) == NULL) return 0;
r->neg = a->neg^b->neg;
r->top = num;
if (a==b)
bn_sqr_mont(r->d,a->d,mont->N.d,mont->n0,num);
else
bn_mul_mont(r->d,a->d,b->d,mont->N.d,mont->n0,num);
bn_fix_top(r);
return 1;
}
#endif
BN_CTX_start(ctx); BN_CTX_start(ctx);
tmp = BN_CTX_get(ctx); tmp = BN_CTX_get(ctx);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册