#include <openssl/e_os2.h>
#include <stddef.h>
#include <sys/types.h>
#include <string.h>
#include <openssl/bn.h>
#include <openssl/err.h>
#include <openssl/rsaerr.h>
#include "internal/numbers.h"
#include "internal/constant_time.h"
#include "bn_local.h"

# if BN_BYTES == 8
typedef uint64_t limb_t;
#  if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16
/* nonstandard; implemented by gcc on 64-bit platforms */
typedef __uint128_t limb2_t;
#   define HAVE_LIMB2_T
#  endif
#  define LIMB_BIT_SIZE 64
#  define LIMB_BYTE_SIZE 8
# elif BN_BYTES == 4
typedef uint32_t limb_t;
typedef uint64_t limb2_t;
#  define LIMB_BIT_SIZE 32
#  define LIMB_BYTE_SIZE 4
#  define HAVE_LIMB2_T
# else
#  error "Not supported"
# endif

/*
 * For multiplication we're using schoolbook multiplication,
 * so if we have two numbers, each with 6 "digits" (words),
 * the multiplication is calculated as follows:
 *                        A B C D E F
 *                     x  I J K L M N
 *                     --------------
 *                                N*F
 *                              N*E
 *                            N*D
 *                          N*C
 *                        N*B
 *                      N*A
 *                              M*F
 *                            M*E
 *                          M*D
 *                        M*C
 *                      M*B
 *                    M*A
 *                            L*F
 *                          L*E
 *                        L*D
 *                      L*C
 *                    L*B
 *                  L*A
 *                          K*F
 *                        K*E
 *                      K*D
 *                    K*C
 *                  K*B
 *                K*A
 *                        J*F
 *                      J*E
 *                    J*D
 *                  J*C
 *                J*B
 *              J*A
 *                      I*F
 *                    I*E
 *                  I*D
 *                I*C
 *              I*B
 *        +   I*A
 *       ==========================
 *                        N*B N*D N*F
 *                    + N*A N*C N*E
 *                    + M*B M*D M*F
 *                  + M*A M*C M*E
 *                  + L*B L*D L*F
 *                + L*A L*C L*E
 *                + K*B K*D K*F
 *              + K*A K*C K*E
 *              + J*B J*D J*F
 *            + J*A J*C J*E
 *            + I*B I*D I*F
 *          + I*A I*C I*E
 *
 *                        1+1 1+3 1+5
 *                    1+0 1+2 1+4
 *                    0+1 0+3 0+5
 *                  0+0 0+2 0+4
 *
 *          0 1 2 3 4 5 6
 * which requires n^2 multiplications and 2n full length additions,
 * as we can keep every other result of limb multiplication in two separate
 * limbs
 */

#if defined HAVE_LIMB2_T
static ossl_inline void _mul_limb(limb_t *hi, limb_t *lo, limb_t a, limb_t b)
{
    limb2_t t;
    /*
     * this is idiomatic code to tell compiler to use the native mul
     * those three lines will actually compile to single instruction
     */
    t = (limb2_t)a * b;
    *hi = t >> LIMB_BIT_SIZE;
    *lo = (limb_t)t;
}
#elif (BN_BYTES == 8) && (defined _MSC_VER)
/* https://learn.microsoft.com/en-us/cpp/intrinsics/umul128?view=msvc-170 */
#pragma intrinsic(_umul128)
static ossl_inline void _mul_limb(limb_t *hi, limb_t *lo, limb_t a, limb_t b)
{
    *lo = _umul128(a, b, hi);
}
#else
/*
 * if the compiler doesn't have either a 128 bit data type or a "return
 * high 64 bits of multiplication" intrinsic, build the result from
 * 32x32 bit partial products
 */
static ossl_inline void _mul_limb(limb_t *hi, limb_t *lo, limb_t a, limb_t b)
{
    limb_t a_low = (limb_t)(uint32_t)a;
    limb_t a_hi = a >> 32;
    limb_t b_low = (limb_t)(uint32_t)b;
    limb_t b_hi = b >> 32;

    limb_t p0 = a_low * b_low;
    limb_t p1 = a_low * b_hi;
    limb_t p2 = a_hi * b_low;
    limb_t p3 = a_hi * b_hi;

    uint32_t cy = (uint32_t)(((p0 >> 32) + (uint32_t)p1 + (uint32_t)p2) >> 32);

    *lo = p0 + (p1 << 32) + (p2 << 32);
    *hi = p3 + (p1 >> 32) + (p2 >> 32) + cy;
}
#endif

/* add two limbs with carry in, return carry out */
static ossl_inline limb_t _add_limb(limb_t *ret, limb_t a, limb_t b,
                                    limb_t carry)
{
    limb_t carry1, carry2, t;
    /*
     * `c = a + b; if (c < a)` is idiomatic code that makes compilers
     * use add with carry on assembly level
     */
    *ret = a + carry;
    if (*ret < a)
        carry1 = 1;
    else
        carry1 = 0;
    t = *ret;
    *ret = t + b;
    if (*ret < t)
        carry2 = 1;
    else
        carry2 = 0;
    return carry1 + carry2;
}

/*
 * add two numbers of the same size, return overflow
 *
 * add a to b, place result in ret; all arrays need to be n limbs long
 * return overflow from addition (0 or 1)
 */
static ossl_inline limb_t add(limb_t *ret, limb_t *a, limb_t *b, size_t n)
{
    limb_t c = 0;
    ossl_ssize_t i;

    for (i = n - 1; i > -1; i--)
        c = _add_limb(&ret[i], a[i], b[i], c);
    return c;
}
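/*
 * Illustrative note on the even/odd accumulation used by limb_mul() below
 * (a sketch of the scheme, not additional functionality): for row i the
 * product a[i] * b[j] is written as a hi:lo pair into r_even[i + j] and
 * r_even[i + j + 1] when j is even, and into the same positions of r_odd
 * when j is odd.  Consecutive even values of j (j and j + 2) therefore
 * write non-overlapping limb pairs, and likewise for odd j, so no carries
 * can occur while the partial products are written; all carries are
 * resolved by the two add() calls that fold r_even and r_odd into ret.
 */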
/*
 * return number of limbs necessary for temporary values
 * when multiplying numbers n limbs large
 */
static ossl_inline size_t mul_limb_numb(size_t n)
{
    return 2 * n * 2;
}

/*
 * multiply two numbers of the same size
 *
 * multiply a by b, place result in ret; a and b need to be n limbs long
 * ret needs to be 2*n limbs long, tmp needs to be mul_limb_numb(n) limbs
 * long
 */
static void limb_mul(limb_t *ret, limb_t *a, limb_t *b, size_t n, limb_t *tmp)
{
    limb_t *r_odd, *r_even;
    size_t i, j, k;

    r_odd = tmp;
    r_even = &tmp[2 * n];

    memset(ret, 0, 2 * n * sizeof(limb_t));

    for (i = 0; i < n; i++) {
        for (k = 0; k < i + n + 1; k++) {
            r_even[k] = 0;
            r_odd[k] = 0;
        }
        for (j = 0; j < n; j++) {
            /*
             * place results from even and odd limbs in separate arrays
             * so that we don't have to calculate overflow every time we
             * get individual limb multiplication result
             */
            if (j % 2 == 0)
                _mul_limb(&r_even[i + j], &r_even[i + j + 1], a[i], b[j]);
            else
                _mul_limb(&r_odd[i + j], &r_odd[i + j + 1], a[i], b[j]);
        }
        /*
         * skip the least significant limbs when adding multiples of
         * more significant limbs (they're zero anyway)
         */
        add(ret, ret, r_even, n + i + 1);
        add(ret, ret, r_odd, n + i + 1);
    }
}

/* modifies the value in place by performing a right shift by one bit */
static ossl_inline void rshift1(limb_t *val, size_t n)
{
    limb_t shift_in = 0, shift_out = 0;
    size_t i;

    for (i = 0; i < n; i++) {
        shift_out = val[i] & 1;
        val[i] = shift_in << (LIMB_BIT_SIZE - 1) | (val[i] >> 1);
        shift_in = shift_out;
    }
}

/* extend the LSB of flag to all bits of limb */
static ossl_inline limb_t mk_mask(limb_t flag)
{
    flag |= flag << 1;
    flag |= flag << 2;
    flag |= flag << 4;
    flag |= flag << 8;
    flag |= flag << 16;
#if (LIMB_BYTE_SIZE == 8)
    flag |= flag << 32;
#endif
    return flag;
}

/*
 * copy from either a or b to ret based on flag
 * when flag == 0, then copies from b
 * when flag == 1, then copies from a
 */
static ossl_inline void cselect(limb_t flag, limb_t *ret, limb_t *a,
                                limb_t *b, size_t n)
{
    /*
     * would be more efficient with non volatile mask, but then gcc
     * generates code with jumps
     */
    volatile limb_t mask;
    size_t i;

    mask = mk_mask(flag);
    for (i = 0; i < n; i++) {
#if (LIMB_BYTE_SIZE == 8)
        ret[i] = constant_time_select_64(mask, a[i], b[i]);
#else
        ret[i] = constant_time_select_32(mask, a[i], b[i]);
#endif
    }
}

static limb_t _sub_limb(limb_t *ret, limb_t a, limb_t b, limb_t borrow)
{
    limb_t borrow1, borrow2, t;
    /*
     * while it doesn't look constant-time, this is idiomatic code
     * to tell compilers to use the carry bit from subtraction
     */
    *ret = a - borrow;
    if (*ret > a)
        borrow1 = 1;
    else
        borrow1 = 0;
    t = *ret;
    *ret = t - b;
    if (*ret > t)
        borrow2 = 1;
    else
        borrow2 = 0;
    return borrow1 + borrow2;
}
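/*
 * Worked example for the borrow handling above (illustrative only,
 * assuming 64-bit limbs): _sub_limb(&r, 0, 1, 0) first computes 0 - 0
 * (no borrow in), then 0 - 1, which wraps around; r ends up as 2^64 - 1
 * and the function returns 1, which the caller feeds as the borrow into
 * the next, more significant limb pair.
 */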
/*
 * place the result of a - b into ret, return the borrow bit.
 * All arrays need to be n limbs long
 */
static limb_t sub(limb_t *ret, limb_t *a, limb_t *b, size_t n)
{
    limb_t borrow = 0;
    ossl_ssize_t i;

    for (i = n - 1; i > -1; i--)
        borrow = _sub_limb(&ret[i], a[i], b[i], borrow);
    return borrow;
}

/* return the number of limbs necessary to allocate for the mod() tmp operand */
static ossl_inline size_t mod_limb_numb(size_t anum, size_t modnum)
{
    return (anum + modnum) * 3;
}

/*
 * calculate a % mod, place the result in ret
 * size of a is defined by anum, size of ret and mod is modnum,
 * size of tmp is returned by mod_limb_numb()
 */
static void mod(limb_t *ret, limb_t *a, size_t anum, limb_t *mod,
                size_t modnum, limb_t *tmp)
{
    limb_t *atmp, *modtmp, *rettmp;
    limb_t res;
    size_t i;

    memset(tmp, 0, mod_limb_numb(anum, modnum) * LIMB_BYTE_SIZE);

    atmp = tmp;
    modtmp = &tmp[anum + modnum];
    rettmp = &tmp[(anum + modnum) * 2];

    /* copy a into the least significant limbs of the working buffer */
    for (i = modnum; i < anum + modnum; i++)
        atmp[i] = a[i - modnum];
    /* copy mod into the most significant limbs, i.e. shifted all the way up */
    for (i = 0; i < modnum; i++)
        modtmp[i] = mod[i];

    /*
     * shift the modulus down one bit at a time and subtract it whenever
     * the result doesn't go negative; after anum * LIMB_BIT_SIZE steps
     * only the remainder is left
     */
    for (i = 0; i < anum * LIMB_BIT_SIZE; i++) {
        rshift1(modtmp, anum + modnum);
        res = sub(rettmp, atmp, modtmp, anum + modnum);
        cselect(res, atmp, atmp, rettmp, anum + modnum);
    }

    memcpy(ret, &atmp[anum], sizeof(limb_t) * modnum);
}

/*
 * return the number of limbs necessary for the tmp operand of
 * _mul_add_limb() when a is anum limbs long
 */
static ossl_inline size_t _mul_add_limb_numb(size_t anum)
{
    return 2 * (anum + 1);
}

/*
 * multiply a (anum limbs long) by the single limb m and add the product
 * to ret (anum limbs long); return the most significant limb of the
 * product together with the carries from the additions
 */
static limb_t _mul_add_limb(limb_t *ret, limb_t *a, size_t anum,
                            limb_t m, limb_t *tmp)
{
    limb_t carry;
    limb_t *r_odd, *r_even;
    size_t i;

    memset(tmp, 0, sizeof(limb_t) * (anum + 1) * 2);

    r_odd = tmp;
    r_even = &tmp[anum + 1];

    for (i = 0; i < anum; i++) {
        /*
         * as in limb_mul(), place results from even and odd limbs in
         * separate arrays so that carries only need to be resolved by
         * the additions below
         */
        if (i % 2 == 0)
            _mul_limb(&r_even[i], &r_even[i + 1], a[i], m);
        else
            _mul_limb(&r_odd[i], &r_odd[i + 1], a[i], m);
    }

    /*
     * add the least significant anum limbs of the product into ret; the
     * most significant limb (index 0) plus the carries out of the
     * additions always fits in a single limb and is handed back
     */
    carry = add(ret, ret, &r_even[1], anum);
    carry += add(ret, ret, &r_odd[1], anum);
    carry += r_even[0] + r_odd[0];

    return carry;
}

/*
 * return the number of limbs necessary for the mod_montgomery() tmp
 * operand when the modulus is modnum limbs long
 */
static ossl_inline size_t mod_montgomery_limb_numb(size_t modnum)
{
    return modnum * 2 + _mul_add_limb_numb(modnum);
}

/*
 * Montgomery reduction: calculate a * R^-1 % mod and place the result in
 * ret, where R = 2^(modnum * LIMB_BIT_SIZE).
 * a is anum limbs long, ret and mod are modnum limbs long.
 * ni0 is the Montgomery constant -(mod^-1) mod 2^LIMB_BIT_SIZE, i.e. the
 * n0[0] value of a BN_MONT_CTX.
 * tmp needs to be mod_montgomery_limb_numb(modnum) limbs long.
 */
static void mod_montgomery(limb_t *ret, limb_t *a, size_t anum, limb_t *mod,
                           size_t modnum, limb_t ni0, limb_t *tmp)
{
    limb_t carry, v;
    limb_t *res, *rp, *tmp2;
    ossl_ssize_t i;

    res = tmp;
    /*
     * for the intermediate result we need an integer twice as long as the
     * modulus, with the input placed in the least significant limbs
     */
    memset(res, 0, sizeof(limb_t) * (modnum * 2));
    memcpy(&res[modnum * 2 - anum], a, sizeof(limb_t) * anum);
    rp = &res[modnum];
    tmp2 = &res[modnum * 2];

    carry = 0;

    /*
     * add multiples of the modulus that zero out the least significant
     * limb of the running value, moving the window up one limb at a time;
     * after modnum steps the bottom half is zero and dividing by R is
     * just dropping it
     */
    for (i = modnum; i > 0; i--, rp--) {
        v = _mul_add_limb(rp, mod, modnum, rp[modnum - 1] * ni0, tmp2);
        v = v + carry + rp[-1];
        carry |= (v != rp[-1]);
        carry &= (v <= rp[-1]);
        rp[-1] = v;
    }

    /* perform the final reduction by mod... */
    carry -= sub(ret, rp, mod, modnum);

    /* ...conditionally */
    cselect(carry, ret, rp, ret, modnum);
}

/* allocated buffer should be freed afterwards */
static void BN_to_limb(const BIGNUM *bn, limb_t *buf, size_t limbs)
{
    int i;
    int real_limbs = (BN_num_bytes(bn) + LIMB_BYTE_SIZE - 1) / LIMB_BYTE_SIZE;
    limb_t *ptr = buf + (limbs - real_limbs);

    for (i = 0; i < real_limbs; i++)
        ptr[i] = bn->d[real_limbs - i - 1];
}

#if LIMB_BYTE_SIZE == 8
static ossl_inline uint64_t be64(uint64_t host)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    if (is_endian.little) {
        uint64_t big = 0;

        big |= (host & 0xff00000000000000) >> 56;
        big |= (host & 0x00ff000000000000) >> 40;
        big |= (host & 0x0000ff0000000000) >> 24;
        big |= (host & 0x000000ff00000000) >> 8;
        big |= (host & 0x00000000ff000000) << 8;
        big |= (host & 0x0000000000ff0000) << 24;
        big |= (host & 0x000000000000ff00) << 40;
        big |= (host & 0x00000000000000ff) << 56;
        return big;
    } else {
        return host;
    }
}

#else
/* Not all platforms have htobe32(). */
static ossl_inline uint32_t be32(uint32_t host)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    if (is_endian.little) {
        uint32_t big = 0;

        big |= (host & 0xff000000) >> 24;
        big |= (host & 0x00ff0000) >> 8;
        big |= (host & 0x0000ff00) << 8;
        big |= (host & 0x000000ff) << 24;
        return big;
    } else {
        return host;
    }
}
#endif

/*
 * We assume that intermediate, possible_arg2, blinding, and ctx are used
 * similarly to the BN_BLINDING_invert_ex() arguments.
 * to_mod is the RSA modulus.
 * buf and num are the serialization buffer and its length.
 *
 * Here we use classic/Montgomery multiplication and modulo. After the
 * calculation is finished we serialize the new structure instead of
 * BIGNUMs, taking endianness into account.
 */
int ossl_bn_rsa_do_unblind(const BIGNUM *intermediate,
                           const BN_BLINDING *blinding,
                           const BIGNUM *possible_arg2,
                           const BIGNUM *to_mod, BN_CTX *ctx,
                           unsigned char *buf, int num)
{
    limb_t *l_im = NULL, *l_mul = NULL, *l_mod = NULL;
    limb_t *l_ret = NULL, *l_tmp = NULL, l_buf;
    size_t l_im_count = 0, l_mul_count = 0, l_size = 0, l_mod_count = 0;
    size_t l_tmp_count = 0;
    int ret = 0;

    size_t i;
    unsigned char *tmp;

    const BIGNUM *arg1 = intermediate;
    const BIGNUM *arg2 = (possible_arg2 == NULL) ? blinding->Ai : possible_arg2;
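    /*
     * The BIGNUM inputs are converted below into fixed-size limb arrays
     * stored most significant limb first.  As an illustration, with
     * 64-bit limbs a 2048-bit (256-byte) modulus converts to 256 / 8 = 32
     * limbs, and a 255-byte value still rounds up to 32 limbs.
     */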
    l_im_count = (BN_num_bytes(arg1) + LIMB_BYTE_SIZE - 1) / LIMB_BYTE_SIZE;
    l_mul_count = (BN_num_bytes(arg2) + LIMB_BYTE_SIZE - 1) / LIMB_BYTE_SIZE;
    l_mod_count = (BN_num_bytes(to_mod) + LIMB_BYTE_SIZE - 1) / LIMB_BYTE_SIZE;

    l_size = l_im_count > l_mul_count ? l_im_count : l_mul_count;
    l_im = OPENSSL_zalloc(l_size * LIMB_BYTE_SIZE);
    l_mul = OPENSSL_zalloc(l_size * LIMB_BYTE_SIZE);
    l_mod = OPENSSL_zalloc(l_mod_count * LIMB_BYTE_SIZE);

    if ((l_im == NULL) || (l_mul == NULL) || (l_mod == NULL))
        goto err;

    BN_to_limb(arg1, l_im, l_size);
    BN_to_limb(arg2, l_mul, l_size);
    BN_to_limb(to_mod, l_mod, l_mod_count);

    l_ret = OPENSSL_malloc(2 * l_size * LIMB_BYTE_SIZE);

    if (blinding->m_ctx != NULL) {
        l_tmp_count = mul_limb_numb(l_size) > mod_montgomery_limb_numb(l_mod_count) ?
                      mul_limb_numb(l_size) : mod_montgomery_limb_numb(l_mod_count);
        l_tmp = OPENSSL_malloc(l_tmp_count * LIMB_BYTE_SIZE);
    } else {
        l_tmp_count = mul_limb_numb(l_size) > mod_limb_numb(2 * l_size, l_mod_count) ?
                      mul_limb_numb(l_size) : mod_limb_numb(2 * l_size, l_mod_count);
        l_tmp = OPENSSL_malloc(l_tmp_count * LIMB_BYTE_SIZE);
    }

    if ((l_ret == NULL) || (l_tmp == NULL))
        goto err;

    if (blinding->m_ctx != NULL) {
        limb_mul(l_ret, l_im, l_mul, l_size, l_tmp);
        mod_montgomery(l_ret, l_ret, 2 * l_size, l_mod, l_mod_count,
                       blinding->m_ctx->n0[0], l_tmp);
    } else {
        limb_mul(l_ret, l_im, l_mul, l_size, l_tmp);
        mod(l_ret, l_ret, 2 * l_size, l_mod, l_mod_count, l_tmp);
    }

    /*
     * modulus size in bytes can be equal to num but after limbs
     * conversion it becomes bigger
     */
    if (num < BN_num_bytes(to_mod)) {
        BNerr(BN_F_OSSL_BN_RSA_DO_UNBLIND, ERR_R_PASSED_INVALID_ARGUMENT);
        goto err;
    }

    memset(buf, 0, num);
    tmp = buf + num - BN_num_bytes(to_mod);
    for (i = 0; i < l_mod_count; i++) {
#if LIMB_BYTE_SIZE == 8
        l_buf = be64(l_ret[i]);
#else
        l_buf = be32(l_ret[i]);
#endif
        if (i == 0) {
            int delta = LIMB_BYTE_SIZE - ((l_mod_count * LIMB_BYTE_SIZE) - num);

            memcpy(tmp, ((char *)&l_buf) + LIMB_BYTE_SIZE - delta, delta);
            tmp += delta;
        } else {
            memcpy(tmp, &l_buf, LIMB_BYTE_SIZE);
            tmp += LIMB_BYTE_SIZE;
        }
    }
    ret = num;

 err:
    OPENSSL_free(l_im);
    OPENSSL_free(l_mul);
    OPENSSL_free(l_mod);
    OPENSSL_free(l_tmp);
    OPENSSL_free(l_ret);

    return ret;
}
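/*
 * A rough usage sketch (caller-side variable names are hypothetical): the
 * RSA private-key path is expected to call this roughly as
 *
 *     outlen = ossl_bn_rsa_do_unblind(blinded_result, blinding, unblind,
 *                                     rsa->n, ctx, out, out_size);
 *
 * where a return value of 0 indicates failure; on success out holds the
 * unblinded value serialized big-endian and left-padded with zeros to
 * out_size bytes, so the secret intermediate never has to be converted
 * back into a BIGNUM.
 */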