diff --git a/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h b/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h deleted file mode 100644 index a98ffe4d7e35b1cd2ce4aae4a0403b83d8897f30..0000000000000000000000000000000000000000 --- a/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. - * Copyright 2016 Cryptography Research, Inc. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - * - * Originally written by Mike Hamburg - */ - -#ifndef __ARCH_ARM_32_ARCH_INTRINSICS_H__ -# define __ARCH_ARM_32_ARCH_INTRINSICS_H__ - -# define ARCH_WORD_BITS 32 - -static __inline__ __attribute((always_inline, unused)) -uint32_t word_is_zero(uint32_t a) -{ - uint32_t ret; - - asm("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc"); - return ret; -} - -static __inline__ __attribute((always_inline, unused)) -uint64_t widemul(uint32_t a, uint32_t b) -{ - /* - * Could be UMULL, but it's hard to express to CC that the registers must - * be different - */ - return ((uint64_t)a) * b; -} - -#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */ diff --git a/crypto/ec/curve448/arch_arm_32/f_impl.c b/crypto/ec/curve448/arch_arm_32/f_impl.c deleted file mode 100644 index 8a2b0886b5c3d7770e2460472e5c5870f75f84cc..0000000000000000000000000000000000000000 --- a/crypto/ec/curve448/arch_arm_32/f_impl.c +++ /dev/null @@ -1,846 +0,0 @@ -/* - * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. - * Copyright 2014 Cryptography Research, Inc. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. 
You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - * - * Originally written by Mike Hamburg - */ - -#include "field.h" - -static inline void __attribute__ ((gnu_inline, always_inline)) - smlal(uint64_t *acc, const uint32_t a, const uint32_t b) -{ - -#ifdef __ARMEL__ - uint32_t lo = *acc, hi = (*acc) >> 32; - - __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" - : [lo]"+&r"(lo), [hi]"+&r"(hi) - : [a]"r"(a), [b]"r"(b)); - - - *acc = lo + (((uint64_t)hi) << 32); -#else - *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)b; -#endif -} - -static inline void __attribute__ ((gnu_inline, always_inline)) - smlal2(uint64_t *acc, const uint32_t a, const uint32_t b) -{ -#ifdef __ARMEL__ - uint32_t lo = *acc, hi = (*acc) >> 32; - - __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]" - : [lo]"+&r"(lo), [hi]"+&r"(hi) - : [a]"r"(a), [b]"r"(2 * b)); - - - - *acc = lo + (((uint64_t)hi) << 32); -#else - *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)(b * 2); -#endif -} - -static inline void __attribute__ ((gnu_inline, always_inline)) - smull(uint64_t *acc, const uint32_t a, const uint32_t b) -{ -#ifdef __ARMEL__ - uint32_t lo, hi; - - __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]" - : [lo]"=&r"(lo), [hi]"=&r"(hi) - : [a]"r"(a), [b]"r"(b)); - - *acc = lo + (((uint64_t)hi) << 32); -#else - *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)b; -#endif -} - -static inline void __attribute__ ((gnu_inline, always_inline)) - smull2(uint64_t *acc, const uint32_t a, const uint32_t b) -{ -#ifdef __ARMEL__ - uint32_t lo, hi; - - __asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]" - : [lo]"=&r"(lo), [hi]"=&r"(hi) - : [a]"r"(a), [b]"r"(2*b)); - - *acc = lo + (((uint64_t)hi) << 32); -#else - *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)(b * 2); -#endif -} - -void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs) -{ - - const uint32_t *a = as->limb, *b = bs->limb; - uint32_t *c = cs->limb; - - uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1; - uint32_t mask = (1 << 28) - 1; - - uint32_t aa[8], bm[8]; - - int i; - for (i = 0; i < 8; i++) { - aa[i] = a[i] + a[i + 8]; - bm[i] = b[i] - b[i + 8]; - } - - uint32_t ax, bx; - { - /* t^3 terms */ - smull(&accum1, ax = aa[1], bx = b[15]); - smull(&accum3, ax = aa[2], bx); - smlal(&accum1, ax, bx = b[14]); - smlal(&accum3, ax = aa[3], bx); - smlal(&accum1, ax, bx = b[13]); - smlal(&accum3, ax = aa[4], bx); - smlal(&accum1, ax, bx = b[12]); - smlal(&accum3, ax = aa[5], bx); - smlal(&accum1, ax, bx = b[11]); - smlal(&accum3, ax = aa[6], bx); - smlal(&accum1, ax, bx = b[10]); - smlal(&accum3, ax = aa[7], bx); - smlal(&accum1, ax, bx = b[9]); - - accum0 = accum1; - accum2 = accum3; - - /* t^2 terms */ - smlal(&accum2, ax = aa[0], bx); - smlal(&accum0, ax, bx = b[8]); - smlal(&accum2, ax = aa[1], bx); - - smlal(&accum0, ax = a[9], bx = b[7]); - smlal(&accum2, ax = a[10], bx); - smlal(&accum0, ax, bx = b[6]); - smlal(&accum2, ax = a[11], bx); - smlal(&accum0, ax, bx = b[5]); - smlal(&accum2, ax = a[12], bx); - smlal(&accum0, ax, bx = b[4]); - smlal(&accum2, ax = a[13], bx); - smlal(&accum0, ax, bx = b[3]); - smlal(&accum2, ax = a[14], bx); - smlal(&accum0, ax, bx = b[2]); - smlal(&accum2, ax = a[15], bx); - smlal(&accum0, ax, bx = b[1]); - - /* t terms */ - accum1 += accum0; - accum3 += accum2; - smlal(&accum3, ax = a[8], bx); - smlal(&accum1, ax, bx = b[0]); - smlal(&accum3, ax = a[9], bx); - - smlal(&accum1, ax = a[1], bx = bm[7]); - smlal(&accum3, ax = a[2], bx); - 
smlal(&accum1, ax, bx = bm[6]); - smlal(&accum3, ax = a[3], bx); - smlal(&accum1, ax, bx = bm[5]); - smlal(&accum3, ax = a[4], bx); - smlal(&accum1, ax, bx = bm[4]); - smlal(&accum3, ax = a[5], bx); - smlal(&accum1, ax, bx = bm[3]); - smlal(&accum3, ax = a[6], bx); - smlal(&accum1, ax, bx = bm[2]); - smlal(&accum3, ax = a[7], bx); - smlal(&accum1, ax, bx = bm[1]); - - /* 1 terms */ - smlal(&accum2, ax = a[0], bx); - smlal(&accum0, ax, bx = bm[0]); - smlal(&accum2, ax = a[1], bx); - - accum2 += accum0 >> 28; - accum3 += accum1 >> 28; - - c[0] = ((uint32_t)(accum0)) & mask; - c[1] = ((uint32_t)(accum2)) & mask; - c[8] = ((uint32_t)(accum1)) & mask; - c[9] = ((uint32_t)(accum3)) & mask; - - accumC0 = accum2 >> 28; - accumC1 = accum3 >> 28; - } - { - /* t^3 terms */ - smull(&accum1, ax = aa[3], bx = b[15]); - smull(&accum3, ax = aa[4], bx); - smlal(&accum1, ax, bx = b[14]); - smlal(&accum3, ax = aa[5], bx); - smlal(&accum1, ax, bx = b[13]); - smlal(&accum3, ax = aa[6], bx); - smlal(&accum1, ax, bx = b[12]); - smlal(&accum3, ax = aa[7], bx); - smlal(&accum1, ax, bx = b[11]); - - accum0 = accum1; - accum2 = accum3; - - /* t^2 terms */ - smlal(&accum2, ax = aa[0], bx); - smlal(&accum0, ax, bx = b[10]); - smlal(&accum2, ax = aa[1], bx); - smlal(&accum0, ax, bx = b[9]); - smlal(&accum2, ax = aa[2], bx); - smlal(&accum0, ax, bx = b[8]); - smlal(&accum2, ax = aa[3], bx); - - smlal(&accum0, ax = a[11], bx = b[7]); - smlal(&accum2, ax = a[12], bx); - smlal(&accum0, ax, bx = b[6]); - smlal(&accum2, ax = a[13], bx); - smlal(&accum0, ax, bx = b[5]); - smlal(&accum2, ax = a[14], bx); - smlal(&accum0, ax, bx = b[4]); - smlal(&accum2, ax = a[15], bx); - smlal(&accum0, ax, bx = b[3]); - - /* t terms */ - accum1 += accum0; - accum3 += accum2; - smlal(&accum3, ax = a[8], bx); - smlal(&accum1, ax, bx = b[2]); - smlal(&accum3, ax = a[9], bx); - smlal(&accum1, ax, bx = b[1]); - smlal(&accum3, ax = a[10], bx); - smlal(&accum1, ax, bx = b[0]); - smlal(&accum3, ax = a[11], bx); - - smlal(&accum1, ax = a[3], bx = bm[7]); - smlal(&accum3, ax = a[4], bx); - smlal(&accum1, ax, bx = bm[6]); - smlal(&accum3, ax = a[5], bx); - smlal(&accum1, ax, bx = bm[5]); - smlal(&accum3, ax = a[6], bx); - smlal(&accum1, ax, bx = bm[4]); - smlal(&accum3, ax = a[7], bx); - smlal(&accum1, ax, bx = bm[3]); - - /* 1 terms */ - smlal(&accum2, ax = a[0], bx); - smlal(&accum0, ax, bx = bm[2]); - smlal(&accum2, ax = a[1], bx); - smlal(&accum0, ax, bx = bm[1]); - smlal(&accum2, ax = a[2], bx); - smlal(&accum0, ax, bx = bm[0]); - smlal(&accum2, ax = a[3], bx); - - accum0 += accumC0; - accum1 += accumC1; - accum2 += accum0 >> 28; - accum3 += accum1 >> 28; - - c[2] = ((uint32_t)(accum0)) & mask; - c[3] = ((uint32_t)(accum2)) & mask; - c[10] = ((uint32_t)(accum1)) & mask; - c[11] = ((uint32_t)(accum3)) & mask; - - accumC0 = accum2 >> 28; - accumC1 = accum3 >> 28; - } - { - - /* t^3 terms */ - smull(&accum1, ax = aa[5], bx = b[15]); - smull(&accum3, ax = aa[6], bx); - smlal(&accum1, ax, bx = b[14]); - smlal(&accum3, ax = aa[7], bx); - smlal(&accum1, ax, bx = b[13]); - - accum0 = accum1; - accum2 = accum3; - - /* t^2 terms */ - - smlal(&accum2, ax = aa[0], bx); - smlal(&accum0, ax, bx = b[12]); - smlal(&accum2, ax = aa[1], bx); - smlal(&accum0, ax, bx = b[11]); - smlal(&accum2, ax = aa[2], bx); - smlal(&accum0, ax, bx = b[10]); - smlal(&accum2, ax = aa[3], bx); - smlal(&accum0, ax, bx = b[9]); - smlal(&accum2, ax = aa[4], bx); - smlal(&accum0, ax, bx = b[8]); - smlal(&accum2, ax = aa[5], bx); - - smlal(&accum0, ax = a[13], bx = b[7]); - smlal(&accum2, 
ax = a[14], bx); - smlal(&accum0, ax, bx = b[6]); - smlal(&accum2, ax = a[15], bx); - smlal(&accum0, ax, bx = b[5]); - - /* t terms */ - accum1 += accum0; - accum3 += accum2; - - smlal(&accum3, ax = a[8], bx); - smlal(&accum1, ax, bx = b[4]); - smlal(&accum3, ax = a[9], bx); - smlal(&accum1, ax, bx = b[3]); - smlal(&accum3, ax = a[10], bx); - smlal(&accum1, ax, bx = b[2]); - smlal(&accum3, ax = a[11], bx); - smlal(&accum1, ax, bx = b[1]); - smlal(&accum3, ax = a[12], bx); - smlal(&accum1, ax, bx = b[0]); - smlal(&accum3, ax = a[13], bx); - - smlal(&accum1, ax = a[5], bx = bm[7]); - smlal(&accum3, ax = a[6], bx); - smlal(&accum1, ax, bx = bm[6]); - smlal(&accum3, ax = a[7], bx); - smlal(&accum1, ax, bx = bm[5]); - - /* 1 terms */ - - smlal(&accum2, ax = a[0], bx); - smlal(&accum0, ax, bx = bm[4]); - smlal(&accum2, ax = a[1], bx); - smlal(&accum0, ax, bx = bm[3]); - smlal(&accum2, ax = a[2], bx); - smlal(&accum0, ax, bx = bm[2]); - smlal(&accum2, ax = a[3], bx); - smlal(&accum0, ax, bx = bm[1]); - smlal(&accum2, ax = a[4], bx); - smlal(&accum0, ax, bx = bm[0]); - smlal(&accum2, ax = a[5], bx); - - accum0 += accumC0; - accum1 += accumC1; - accum2 += accum0 >> 28; - accum3 += accum1 >> 28; - - c[4] = ((uint32_t)(accum0)) & mask; - c[5] = ((uint32_t)(accum2)) & mask; - c[12] = ((uint32_t)(accum1)) & mask; - c[13] = ((uint32_t)(accum3)) & mask; - - accumC0 = accum2 >> 28; - accumC1 = accum3 >> 28; - } - { - - /* t^3 terms */ - smull(&accum1, ax = aa[7], bx = b[15]); - accum0 = accum1; - - /* t^2 terms */ - - smull(&accum2, ax = aa[0], bx); - smlal(&accum0, ax, bx = b[14]); - smlal(&accum2, ax = aa[1], bx); - smlal(&accum0, ax, bx = b[13]); - smlal(&accum2, ax = aa[2], bx); - smlal(&accum0, ax, bx = b[12]); - smlal(&accum2, ax = aa[3], bx); - smlal(&accum0, ax, bx = b[11]); - smlal(&accum2, ax = aa[4], bx); - smlal(&accum0, ax, bx = b[10]); - smlal(&accum2, ax = aa[5], bx); - smlal(&accum0, ax, bx = b[9]); - smlal(&accum2, ax = aa[6], bx); - smlal(&accum0, ax, bx = b[8]); - smlal(&accum2, ax = aa[7], bx); - - smlal(&accum0, ax = a[15], bx = b[7]); - - /* t terms */ - accum1 += accum0; - accum3 = accum2; - - smlal(&accum3, ax = a[8], bx); - smlal(&accum1, ax, bx = b[6]); - smlal(&accum3, ax = a[9], bx); - smlal(&accum1, ax, bx = b[5]); - smlal(&accum3, ax = a[10], bx); - smlal(&accum1, ax, bx = b[4]); - smlal(&accum3, ax = a[11], bx); - smlal(&accum1, ax, bx = b[3]); - smlal(&accum3, ax = a[12], bx); - smlal(&accum1, ax, bx = b[2]); - smlal(&accum3, ax = a[13], bx); - smlal(&accum1, ax, bx = b[1]); - smlal(&accum3, ax = a[14], bx); - smlal(&accum1, ax, bx = b[0]); - smlal(&accum3, ax = a[15], bx); - - smlal(&accum1, ax = a[7], bx = bm[7]); - - /* 1 terms */ - - smlal(&accum2, ax = a[0], bx); - smlal(&accum0, ax, bx = bm[6]); - smlal(&accum2, ax = a[1], bx); - smlal(&accum0, ax, bx = bm[5]); - smlal(&accum2, ax = a[2], bx); - smlal(&accum0, ax, bx = bm[4]); - smlal(&accum2, ax = a[3], bx); - smlal(&accum0, ax, bx = bm[3]); - smlal(&accum2, ax = a[4], bx); - smlal(&accum0, ax, bx = bm[2]); - smlal(&accum2, ax = a[5], bx); - smlal(&accum0, ax, bx = bm[1]); - smlal(&accum2, ax = a[6], bx); - smlal(&accum0, ax, bx = bm[0]); - smlal(&accum2, ax = a[7], bx); - - accum0 += accumC0; - accum1 += accumC1; - accum2 += accum0 >> 28; - accum3 += accum1 >> 28; - - c[6] = ((uint32_t)(accum0)) & mask; - c[7] = ((uint32_t)(accum2)) & mask; - c[14] = ((uint32_t)(accum1)) & mask; - c[15] = ((uint32_t)(accum3)) & mask; - - accum0 = accum2 >> 28; - accum1 = accum3 >> 28; - } - - accum0 += accum1; - accum0 += c[8]; - 
accum1 += c[0]; - c[8] = ((uint32_t)(accum0)) & mask; - c[0] = ((uint32_t)(accum1)) & mask; - - accum0 >>= 28; - accum1 >>= 28; - c[9] += ((uint32_t)(accum0)); - c[1] += ((uint32_t)(accum1)); -} - -void gf_sqr(gf_s * __restrict__ cs, const gf as) -{ - const uint32_t *a = as->limb; - uint32_t *c = cs->limb; - - uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1, tmp; - uint32_t mask = (1 << 28) - 1; - - uint32_t bm[8]; - - int i; - for (i = 0; i < 8; i++) { - bm[i] = a[i] - a[i + 8]; - } - - uint32_t ax, bx; - { - /* t^3 terms */ - smull2(&accum1, ax = a[9], bx = a[15]); - smull2(&accum3, ax = a[10], bx); - smlal2(&accum1, ax, bx = a[14]); - smlal2(&accum3, ax = a[11], bx); - smlal2(&accum1, ax, bx = a[13]); - smlal2(&accum3, ax = a[12], bx); - smlal(&accum1, ax, ax); - - accum0 = accum1; - accum2 = accum3; - - /* t^2 terms */ - smlal2(&accum2, ax = a[8], a[9]); - smlal(&accum0, ax, ax); - - smlal2(&accum0, ax = a[1], bx = a[7]); - smlal2(&accum2, ax = a[2], bx); - smlal2(&accum0, ax, bx = a[6]); - smlal2(&accum2, ax = a[3], bx); - smlal2(&accum0, ax, bx = a[5]); - smlal2(&accum2, ax = a[4], bx); - smlal(&accum0, ax, ax); - - /* t terms */ - accum1 += accum0; - accum3 += accum2; - smlal2(&accum3, ax = a[0], bx = a[1]); - smlal(&accum1, ax, ax); - - accum1 = -accum1; - accum3 = -accum3; - accum2 = -accum2; - accum0 = -accum0; - - smlal2(&accum1, ax = bm[1], bx = bm[7]); - smlal2(&accum3, ax = bm[2], bx); - smlal2(&accum1, ax, bx = bm[6]); - smlal2(&accum3, ax = bm[3], bx); - smlal2(&accum1, ax, bx = bm[5]); - smlal2(&accum3, ax = bm[4], bx); - smlal(&accum1, ax, ax); - - /* 1 terms */ - smlal2(&accum2, ax = bm[0], bx = bm[1]); - smlal(&accum0, ax, ax); - - tmp = -accum3; - accum3 = tmp - accum2; - accum2 = tmp; - tmp = -accum1; - accum1 = tmp - accum0; - accum0 = tmp; - - accum2 += accum0 >> 28; - accum3 += accum1 >> 28; - - c[0] = ((uint32_t)(accum0)) & mask; - c[1] = ((uint32_t)(accum2)) & mask; - c[8] = ((uint32_t)(accum1)) & mask; - c[9] = ((uint32_t)(accum3)) & mask; - - accumC0 = accum2 >> 28; - accumC1 = accum3 >> 28; - } - { - /* t^3 terms */ - smull2(&accum1, ax = a[11], bx = a[15]); - smull2(&accum3, ax = a[12], bx); - smlal2(&accum1, ax, bx = a[14]); - smlal2(&accum3, ax = a[13], bx); - smlal(&accum1, ax, ax); - - accum0 = accum1; - accum2 = accum3; - - /* t^2 terms */ - smlal2(&accum2, ax = a[8], bx = a[11]); - smlal2(&accum0, ax, bx = a[10]); - smlal2(&accum2, ax = a[9], bx); - smlal(&accum0, ax, ax); - - smlal2(&accum0, ax = a[3], bx = a[7]); - smlal2(&accum2, ax = a[4], bx); - smlal2(&accum0, ax, bx = a[6]); - smlal2(&accum2, ax = a[5], bx); - smlal(&accum0, ax, ax); - - /* t terms */ - accum1 += accum0; - accum3 += accum2; - smlal2(&accum3, ax = a[0], bx = a[3]); - smlal2(&accum1, ax, bx = a[2]); - smlal2(&accum3, ax = a[1], bx); - smlal(&accum1, ax, ax); - - accum1 = -accum1; - accum3 = -accum3; - accum2 = -accum2; - accum0 = -accum0; - - smlal2(&accum1, ax = bm[3], bx = bm[7]); - smlal2(&accum3, ax = bm[4], bx); - smlal2(&accum1, ax, bx = bm[6]); - smlal2(&accum3, ax = bm[5], bx); - smlal(&accum1, ax, ax); - - /* 1 terms */ - smlal2(&accum2, ax = bm[0], bx = bm[3]); - smlal2(&accum0, ax, bx = bm[2]); - smlal2(&accum2, ax = bm[1], bx); - smlal(&accum0, ax, ax); - - tmp = -accum3; - accum3 = tmp - accum2; - accum2 = tmp; - tmp = -accum1; - accum1 = tmp - accum0; - accum0 = tmp; - - accum0 += accumC0; - accum1 += accumC1; - accum2 += accum0 >> 28; - accum3 += accum1 >> 28; - - c[2] = ((uint32_t)(accum0)) & mask; - c[3] = ((uint32_t)(accum2)) & mask; - c[10] = 
((uint32_t)(accum1)) & mask; - c[11] = ((uint32_t)(accum3)) & mask; - - accumC0 = accum2 >> 28; - accumC1 = accum3 >> 28; - } - { - - /* t^3 terms */ - smull2(&accum1, ax = a[13], bx = a[15]); - smull2(&accum3, ax = a[14], bx); - smlal(&accum1, ax, ax); - - accum0 = accum1; - accum2 = accum3; - - /* t^2 terms */ - - smlal2(&accum2, ax = a[8], bx = a[13]); - smlal2(&accum0, ax, bx = a[12]); - smlal2(&accum2, ax = a[9], bx); - smlal2(&accum0, ax, bx = a[11]); - smlal2(&accum2, ax = a[10], bx); - smlal(&accum0, ax, ax); - - smlal2(&accum0, ax = a[5], bx = a[7]); - smlal2(&accum2, ax = a[6], bx); - smlal(&accum0, ax, ax); - - /* t terms */ - accum1 += accum0; - accum3 += accum2; - - smlal2(&accum3, ax = a[0], bx = a[5]); - smlal2(&accum1, ax, bx = a[4]); - smlal2(&accum3, ax = a[1], bx); - smlal2(&accum1, ax, bx = a[3]); - smlal2(&accum3, ax = a[2], bx); - smlal(&accum1, ax, ax); - - accum1 = -accum1; - accum3 = -accum3; - accum2 = -accum2; - accum0 = -accum0; - - smlal2(&accum1, ax = bm[5], bx = bm[7]); - smlal2(&accum3, ax = bm[6], bx); - smlal(&accum1, ax, ax); - - /* 1 terms */ - - smlal2(&accum2, ax = bm[0], bx = bm[5]); - smlal2(&accum0, ax, bx = bm[4]); - smlal2(&accum2, ax = bm[1], bx); - smlal2(&accum0, ax, bx = bm[3]); - smlal2(&accum2, ax = bm[2], bx); - smlal(&accum0, ax, ax); - - tmp = -accum3; - accum3 = tmp - accum2; - accum2 = tmp; - tmp = -accum1; - accum1 = tmp - accum0; - accum0 = tmp; - - accum0 += accumC0; - accum1 += accumC1; - accum2 += accum0 >> 28; - accum3 += accum1 >> 28; - - c[4] = ((uint32_t)(accum0)) & mask; - c[5] = ((uint32_t)(accum2)) & mask; - c[12] = ((uint32_t)(accum1)) & mask; - c[13] = ((uint32_t)(accum3)) & mask; - - accumC0 = accum2 >> 28; - accumC1 = accum3 >> 28; - } - { - - /* t^3 terms */ - smull(&accum1, ax = a[15], bx = a[15]); - accum0 = accum1; - - /* t^2 terms */ - - smull2(&accum2, ax = a[8], bx); - smlal2(&accum0, ax, bx = a[14]); - smlal2(&accum2, ax = a[9], bx); - smlal2(&accum0, ax, bx = a[13]); - smlal2(&accum2, ax = a[10], bx); - smlal2(&accum0, ax, bx = a[12]); - smlal2(&accum2, ax = a[11], bx); - smlal(&accum0, ax, ax); - - smlal(&accum0, ax = a[7], bx = a[7]); - - /* t terms */ - accum1 += accum0; - accum3 = accum2; - - smlal2(&accum3, ax = a[0], bx); - smlal2(&accum1, ax, bx = a[6]); - smlal2(&accum3, ax = a[1], bx); - smlal2(&accum1, ax, bx = a[5]); - smlal2(&accum3, ax = a[2], bx); - smlal2(&accum1, ax, bx = a[4]); - smlal2(&accum3, ax = a[3], bx); - smlal(&accum1, ax, ax); - - accum1 = -accum1; - accum3 = -accum3; - accum2 = -accum2; - accum0 = -accum0; - - bx = bm[7]; - smlal(&accum1, bx, bx); - - /* 1 terms */ - - smlal2(&accum2, ax = bm[0], bx); - smlal2(&accum0, ax, bx = bm[6]); - smlal2(&accum2, ax = bm[1], bx); - smlal2(&accum0, ax, bx = bm[5]); - smlal2(&accum2, ax = bm[2], bx); - smlal2(&accum0, ax, bx = bm[4]); - smlal2(&accum2, ax = bm[3], bx); - smlal(&accum0, ax, ax); - - tmp = -accum3; - accum3 = tmp - accum2; - accum2 = tmp; - tmp = -accum1; - accum1 = tmp - accum0; - accum0 = tmp; - - accum0 += accumC0; - accum1 += accumC1; - accum2 += accum0 >> 28; - accum3 += accum1 >> 28; - - c[6] = ((uint32_t)(accum0)) & mask; - c[7] = ((uint32_t)(accum2)) & mask; - c[14] = ((uint32_t)(accum1)) & mask; - c[15] = ((uint32_t)(accum3)) & mask; - - accum0 = accum2 >> 28; - accum1 = accum3 >> 28; - } - - accum0 += accum1; - accum0 += c[8]; - accum1 += c[0]; - c[8] = ((uint32_t)(accum0)) & mask; - c[0] = ((uint32_t)(accum1)) & mask; - - accum0 >>= 28; - accum1 >>= 28; - c[9] += ((uint32_t)(accum0)); - c[1] += ((uint32_t)(accum1)); -} - 
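[Note on the removed 32-bit representation: this backend stores an element of GF(p), p = 2^448 - 2^224 - 1, as sixteen 28-bit limbs held in 32-bit words, leaving headroom so a few additions or subtractions can run before any carry propagation (the GF_HEADROOM 2 constant in the f_impl.h deleted below tracks that slack). Reduction uses 2^448 == 2^224 + 1 (mod p): a carry out of the top limb re-enters at limb 8 (the 2^224 position) and at limb 0, which is what the accum0/accum1 tail of gf_mul and gf_sqr above is doing. A minimal portable sketch of the same weak reduction, assuming a bare uint32_t[16] limb array in place of gf_s — weak_reduce, NLIMBS and LIMB_BITS are illustrative names, not identifiers from the deleted files:

    #include <stdint.h>

    #define NLIMBS    16
    #define LIMB_BITS 28

    /* Sketch only, not the deleted OpenSSL routine: one pass of carry
     * propagation for p = 2^448 - 2^224 - 1. The carry out of limb 15
     * (the 2^448 position) is added back into limb 8 (2^224) and
     * limb 0 (2^0), since 2^448 == 2^224 + 1 (mod p). */
    static void weak_reduce(uint32_t x[NLIMBS])
    {
        const uint32_t mask = ((uint32_t)1 << LIMB_BITS) - 1;
        uint32_t tmp = x[NLIMBS - 1] >> LIMB_BITS;
        int i;

        x[8] += tmp;                /* wrap-around carry at 2^224 */
        for (i = NLIMBS - 1; i > 0; i--)
            x[i] = (x[i] & mask) + (x[i - 1] >> LIMB_BITS);
        x[0] = (x[0] & mask) + tmp; /* wrap-around carry at 2^0 */
    }

Carries fold top-down, so the extra carry added into limb 8 is picked up by limb 9 before limb 8 itself is masked; after one pass every limb again fits in 28 bits plus a small carry.]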
-void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b) -{ - uint32_t mask = (1ull << 28) - 1; - const uint32_t *a = as->limb; - uint32_t *c = cs->limb; - uint64_t accum0, accum8; - int i; - uint32_t c0, c8, n0, n8; - - assert(b <= mask); - - c0 = a[0]; - c8 = a[8]; - accum0 = widemul(b, c0); - accum8 = widemul(b, c8); - - c[0] = accum0 & mask; - accum0 >>= 28; - c[8] = accum8 & mask; - accum8 >>= 28; - - i = 1; - { - n0 = a[i]; - n8 = a[i + 8]; - smlal(&accum0, b, n0); - smlal(&accum8, b, n8); - - c[i] = accum0 & mask; - accum0 >>= 28; - c[i + 8] = accum8 & mask; - accum8 >>= 28; - i++; - } - { - c0 = a[i]; - c8 = a[i + 8]; - smlal(&accum0, b, c0); - smlal(&accum8, b, c8); - - c[i] = accum0 & mask; - accum0 >>= 28; - c[i + 8] = accum8 & mask; - accum8 >>= 28; - i++; - } - { - n0 = a[i]; - n8 = a[i + 8]; - smlal(&accum0, b, n0); - smlal(&accum8, b, n8); - - c[i] = accum0 & mask; - accum0 >>= 28; - c[i + 8] = accum8 & mask; - accum8 >>= 28; - i++; - } - { - c0 = a[i]; - c8 = a[i + 8]; - smlal(&accum0, b, c0); - smlal(&accum8, b, c8); - - c[i] = accum0 & mask; - accum0 >>= 28; - c[i + 8] = accum8 & mask; - accum8 >>= 28; - i++; - } - { - n0 = a[i]; - n8 = a[i + 8]; - smlal(&accum0, b, n0); - smlal(&accum8, b, n8); - - c[i] = accum0 & mask; - accum0 >>= 28; - c[i + 8] = accum8 & mask; - accum8 >>= 28; - i++; - } - { - c0 = a[i]; - c8 = a[i + 8]; - smlal(&accum0, b, c0); - smlal(&accum8, b, c8); - - c[i] = accum0 & mask; - accum0 >>= 28; - c[i + 8] = accum8 & mask; - accum8 >>= 28; - i++; - } - { - n0 = a[i]; - n8 = a[i + 8]; - smlal(&accum0, b, n0); - smlal(&accum8, b, n8); - - c[i] = accum0 & mask; - accum0 >>= 28; - c[i + 8] = accum8 & mask; - accum8 >>= 28; - i++; - } - - accum0 += accum8 + c[8]; - c[8] = accum0 & mask; - c[9] += accum0 >> 28; - - accum8 += c[0]; - c[0] = accum8 & mask; - c[1] += accum8 >> 28; -} diff --git a/crypto/ec/curve448/arch_arm_32/f_impl.h b/crypto/ec/curve448/arch_arm_32/f_impl.h deleted file mode 100644 index af90c74f692a5d2328deaa620fbd940923bc94e4..0000000000000000000000000000000000000000 --- a/crypto/ec/curve448/arch_arm_32/f_impl.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. - * Copyright 2014-2016 Cryptography Research, Inc. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. 
You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - * - * Originally written by Mike Hamburg - */ - -#define GF_HEADROOM 2 -#define LIMB(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28 -#define FIELD_LITERAL(a,b,c,d,e,f,g,h) \ - {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}} - -#define LIMB_PLACE_VALUE(i) 28 - -void gf_add_RAW(gf out, const gf a, const gf b) -{ - for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) { - ((uint32xn_t *) out)[i] = - ((const uint32xn_t *)a)[i] + ((const uint32xn_t *)b)[i]; - } -} - -void gf_sub_RAW(gf out, const gf a, const gf b) -{ - for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) { - ((uint32xn_t *) out)[i] = - ((const uint32xn_t *)a)[i] - ((const uint32xn_t *)b)[i]; - } -} - -void gf_bias(gf a, int amt) -{ - uint32_t co1 = ((1ull << 28) - 1) * amt, co2 = co1 - amt; - uint32x4_t lo = { co1, co1, co1, co1 }, hi = { - co2, co1, co1, co1}; - uint32x4_t *aa = (uint32x4_t *) a; - - aa[0] += lo; - aa[1] += lo; - aa[2] += hi; - aa[3] += lo; -} - -void gf_weak_reduce(gf a) -{ - uint64_t mask = (1ull << 28) - 1; - uint64_t tmp = a->limb[15] >> 28; - - a->limb[8] += tmp; - for (unsigned int i = 15; i > 0; i--) { - a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 28); - } - a->limb[0] = (a->limb[0] & mask) + tmp; -} diff --git a/crypto/ec/curve448/arch_neon/arch_intrinsics.h b/crypto/ec/curve448/arch_neon/arch_intrinsics.h deleted file mode 100644 index 17db42643325c4f4b5f0a9eebf04ccd28ecfd610..0000000000000000000000000000000000000000 --- a/crypto/ec/curve448/arch_neon/arch_intrinsics.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. - * Copyright 2016 Cryptography Research, Inc. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - * - * Originally written by Mike Hamburg - */ - -#ifndef __ARCH_NEON_ARCH_INTRINSICS_H__ -# define __ARCH_NEON_ARCH_INTRINSICS_H__ - -# define ARCH_WORD_BITS 32 - -static __inline__ __attribute((always_inline, unused)) -uint32_t word_is_zero(uint32_t a) -{ - uint32_t ret; - __asm__("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc"); - return ret; -} - -static __inline__ __attribute((always_inline, unused)) -uint64_t widemul(uint32_t a, uint32_t b) -{ - /* - * Could be UMULL, but it's hard to express to CC that the registers must - * be different - */ - return ((uint64_t)a) * b; -} - -#endif /* __ARCH_NEON_ARCH_INTRINSICS_H__ */ diff --git a/crypto/ec/curve448/arch_neon/f_impl.c b/crypto/ec/curve448/arch_neon/f_impl.c deleted file mode 100644 index 3b6e28adaf66f27228be633c65a6c2a8c20ec4ec..0000000000000000000000000000000000000000 --- a/crypto/ec/curve448/arch_neon/f_impl.c +++ /dev/null @@ -1,594 +0,0 @@ -/* - * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. - * Copyright 2014 Cryptography Research, Inc. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. 
You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - * - * Originally written by Mike Hamburg - */ - -#include "field.h" - -static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline,unused)) -xx_vaddup_u64(uint64x2_t x) -{ - __asm__ ("vadd.s64 %f0, %e0" : "+w"(x)); - return x; -} - -static __inline__ int64x2_t __attribute__((gnu_inline,always_inline,unused)) -vrev128_s64(int64x2_t x) -{ - __asm__ ("vswp.s64 %e0, %f0" : "+w"(x)); - return x; -} - -static __inline__ uint64x2_t __attribute__((gnu_inline,always_inline)) -vrev128_u64(uint64x2_t x) -{ - __asm__ ("vswp.s64 %e0, %f0" : "+w"(x)); - return x; -} - -static inline void __attribute__((gnu_inline,always_inline,unused)) -smlal(uint64_t *acc, const uint32_t a, const uint32_t b) -{ - *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b; -} - -static inline void __attribute__((gnu_inline,always_inline,unused)) -smlal2(uint64_t *acc, const uint32_t a, const uint32_t b) -{ - *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; -} - -static inline void __attribute__((gnu_inline,always_inline,unused)) -smull(uint64_t *acc, const uint32_t a, const uint32_t b) -{ - *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b; -} - -static inline void __attribute__((gnu_inline,always_inline,unused)) -smull2(uint64_t *acc, const uint32_t a, const uint32_t b) -{ - *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; -} - -void gf_mul(gf_s *__restrict__ cs, const gf as, const gf bs) -{ - #define _bl0 "q0" - #define _bl0_0 "d0" - #define _bl0_1 "d1" - #define _bh0 "q1" - #define _bh0_0 "d2" - #define _bh0_1 "d3" - #define _bs0 "q2" - #define _bs0_0 "d4" - #define _bs0_1 "d5" - #define _bl2 "q3" - #define _bl2_0 "d6" - #define _bl2_1 "d7" - #define _bh2 "q4" - #define _bh2_0 "d8" - #define _bh2_1 "d9" - #define _bs2 "q5" - #define _bs2_0 "d10" - #define _bs2_1 "d11" - - #define _as0 "q6" - #define _as0_0 "d12" - #define _as0_1 "d13" - #define _as2 "q7" - #define _as2_0 "d14" - #define _as2_1 "d15" - #define _al0 "q8" - #define _al0_0 "d16" - #define _al0_1 "d17" - #define _ah0 "q9" - #define _ah0_0 "d18" - #define _ah0_1 "d19" - #define _al2 "q10" - #define _al2_0 "d20" - #define _al2_1 "d21" - #define _ah2 "q11" - #define _ah2_0 "d22" - #define _ah2_1 "d23" - - #define _a0a "q12" - #define _a0a_0 "d24" - #define _a0a_1 "d25" - #define _a0b "q13" - #define _a0b_0 "d26" - #define _a0b_1 "d27" - #define _a1a "q14" - #define _a1a_0 "d28" - #define _a1a_1 "d29" - #define _a1b "q15" - #define _a1b_0 "d30" - #define _a1b_1 "d31" - #define VMAC(op,result,a,b,n) #op" "result", "a", "b"[" #n "]\n\t" - #define VOP3(op,result,a,b) #op" "result", "a", "b"\n\t" - #define VOP2(op,result,a) #op" "result", "a"\n\t" - - int32x2_t *vc = (int32x2_t*) cs->limb; - - __asm__ __volatile__( - - "vld2.32 {"_al0_0","_al0_1","_ah0_0","_ah0_1"}, [%[a],:128]!" "\n\t" - VOP3(vadd.i32,_as0,_al0,_ah0) - - "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t" - VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1) - VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0) - - "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t" - VOP3(vadd.i32,_bs2,_bl2,_bh2) - - "vld2.32 {"_al2_0","_al2_1","_ah2_0","_ah2_1"}, [%[a],:128]!" 
"\n\t" - VOP3(vadd.i32,_as2,_al2,_ah2) - - VMAC(vmull.s32,_a0b,_as0_1,_bs2_1,0) - VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0) - VMAC(vmlal.s32,_a0b,_as2_1,_bs0_1,0) - VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0) - - VMAC(vmull.s32,_a1b,_as0_1,_bs2_1,1) - VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1) - VMAC(vmlal.s32,_a1b,_as2_1,_bs0_1,1) - VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1) - - VOP2(vmov,_a0a,_a0b) - VMAC(vmlal.s32,_a0a,_ah0_1,_bh2_1,0) - VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_0,0) - VMAC(vmlal.s32,_a0a,_ah2_1,_bh0_1,0) - VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_0,0) - - VMAC(vmlsl.s32,_a0b,_al0_1,_bl2_1,0) - VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_0,0) - VMAC(vmlsl.s32,_a0b,_al2_1,_bl0_1,0) - VMAC(vmlal.s32,_a0b,_al0_0,_bs0_0,0) - - VOP2(vmov,_a1a,_a1b) - VMAC(vmlal.s32,_a1a,_ah0_1,_bh2_1,1) - VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_0,1) - VMAC(vmlal.s32,_a1a,_ah2_1,_bh0_1,1) - VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_0,1) - - VOP2(vswp,_a0b_1,_a0a_0) - - VMAC(vmlsl.s32,_a1b,_al0_1,_bl2_1,1) - VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_0,1) - VMAC(vmlsl.s32,_a1b,_al2_1,_bl0_1,1) - VMAC(vmlal.s32,_a1b,_al0_0,_bs0_0,1) - - VOP3(vsra.u64,_a0a,_a0b,"#28") - VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1) - VOP2(vmovn.i64,_a0b_0,_a0b) - - VOP2(vswp,_a1b_1,_a1a_0) - VOP3(vadd.i64,_a1b,_a0a,_a1b) - - - VMAC(vmull.s32,_a0a,_as2_0,_bs2_1,0) - VOP2(vmovn.i64,_a0b_1,_a1b) - VMAC(vmlal.s32,_a0a,_as2_1,_bs2_0,0) - VOP3(vsra.u64,_a1a,_a1b,"#28") - VMAC(vmlal.s32,_a0a,_as0_0,_bh0_1,0) - VOP2(vbic.i32,_a0b,"#0xf0000000") - VMAC(vmlal.s32,_a0a,_as0_1,_bh0_0,0) - "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t" - - VMAC(vmull.s32,_a1b,_as2_0,_bs2_1,1) - VMAC(vmlal.s32,_a1b,_as2_1,_bs2_0,1) - VMAC(vmlal.s32,_a1b,_as0_0,_bh0_1,1) - VMAC(vmlal.s32,_a1b,_as0_1,_bh0_0,1) - - VOP2(vmov,_a0b_1,_a0a_1) - VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0) - VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1) - VMAC(vmlal.s32,_a0a,_ah2_0,_bh2_1,0) - VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_0,0) - VMAC(vmlal.s32,_a0a,_ah0_0,_bl0_1,0) - VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_0,0) - - VMAC(vmlsl.s32,_a0b,_al2_0,_bl2_1,0) - VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_0,0) - VMAC(vmlal.s32,_a0b,_al0_0,_bs0_1,0) - VMAC(vmlal.s32,_a0b,_al0_1,_bs0_0,0) - - VOP2(vmov,_a1a,_a1b) - VMAC(vmlal.s32,_a1a,_ah2_0,_bh2_1,1) - VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_0,1) - VMAC(vmlal.s32,_a1a,_ah0_0,_bl0_1,1) - VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_0,1) - - VOP2(vswp,_a0b_1,_a0a_0) - - VMAC(vmlsl.s32,_a1b,_al2_0,_bl2_1,1) - VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_0,1) - VMAC(vmlal.s32,_a1b,_al0_0,_bs0_1,1) - VMAC(vmlal.s32,_a1b,_al0_1,_bs0_0,1) - - VOP3(vsra.u64,_a0a,_a0b,"#28") - VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0) - VOP2(vmovn.i64,_a0b_0,_a0b) - - VOP2(vswp,_a1b_1,_a1a_0) - VOP3(vadd.i64,_a1b,_a0a,_a1b) - - VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0) - VOP2(vmovn.i64,_a0b_1,_a1b) - VMAC(vmlal.s32,_a0a,_as0_0,_bh2_0,0) - VOP3(vsra.u64,_a1a,_a1b,"#28") - VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0) - VOP2(vbic.i32,_a0b,"#0xf0000000") - VMAC(vmlal.s32,_a0a,_as2_0,_bh0_0,0) - "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t" - - VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1) - VMAC(vmlal.s32,_a1b,_as0_0,_bh2_0,1) - VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1) - VMAC(vmlal.s32,_a1b,_as2_0,_bh0_0,1) - - VOP2(vmov,_a0b_1,_a0a_1) - VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0) - VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1) - VMAC(vmlal.s32,_a0a,_ah2_1,_bh2_1,0) - VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_0,0) - VMAC(vmlal.s32,_a0a,_ah0_1,_bl0_1,0) - VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_0,0) - - VMAC(vmlsl.s32,_a0b,_al2_1,_bl2_1,0) - VMAC(vmlal.s32,_a0b,_al0_0,_bs2_0,0) - VMAC(vmlal.s32,_a0b,_al0_1,_bs0_1,0) - VMAC(vmlal.s32,_a0b,_al2_0,_bs0_0,0) - - VOP2(vmov,_a1a,_a1b) - 
VMAC(vmlal.s32,_a1a,_ah2_1,_bh2_1,1) - VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_0,1) - VMAC(vmlal.s32,_a1a,_ah0_1,_bl0_1,1) - VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_0,1) - - VOP2(vswp,_a0b_1,_a0a_0) - - VMAC(vmlsl.s32,_a1b,_al2_1,_bl2_1,1) - VMAC(vmlal.s32,_a1b,_al0_0,_bs2_0,1) - VMAC(vmlal.s32,_a1b,_al0_1,_bs0_1,1) - VMAC(vmlal.s32,_a1b,_al2_0,_bs0_0,1) - - VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1) - VOP3(vsra.u64,_a0a,_a0b,"#28") - VOP2(vmovn.i64,_a0b_0,_a0b) - - VOP2(vswp,_a1b_1,_a1a_0) - VOP3(vadd.i64,_a1b,_a0a,_a1b) - - VMAC(vmull.s32,_a0a,_as0_0,_bh2_1,0) - VOP2(vmovn.i64,_a0b_1,_a1b) - VMAC(vmlal.s32,_a0a,_as0_1,_bh2_0,0) - VOP3(vsra.u64,_a1a,_a1b,"#28") - VMAC(vmlal.s32,_a0a,_as2_0,_bh0_1,0) - VOP2(vbic.i32,_a0b,"#0xf0000000") - VMAC(vmlal.s32,_a0a,_as2_1,_bh0_0,0) - "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t" - - VMAC(vmull.s32,_a1b,_as0_0,_bh2_1,1) - VMAC(vmlal.s32,_a1b,_as0_1,_bh2_0,1) - VMAC(vmlal.s32,_a1b,_as2_0,_bh0_1,1) - VMAC(vmlal.s32,_a1b,_as2_1,_bh0_0,1) - - VOP2(vmov,_a0b_1,_a0a_1) - VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0) - VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1) - VMAC(vmlal.s32,_a0a,_ah0_0,_bl2_1,0) - VMAC(vmlal.s32,_a0a,_ah0_1,_bl2_0,0) - VMAC(vmlal.s32,_a0a,_ah2_0,_bl0_1,0) - VMAC(vmlal.s32,_a0a,_ah2_1,_bl0_0,0) - - VMAC(vmlal.s32,_a0b,_al0_0,_bs2_1,0) - VMAC(vmlal.s32,_a0b,_al0_1,_bs2_0,0) - VMAC(vmlal.s32,_a0b,_al2_0,_bs0_1,0) - VMAC(vmlal.s32,_a0b,_al2_1,_bs0_0,0) - - VOP2(vmov,_a1a,_a1b) - VMAC(vmlal.s32,_a1a,_ah0_0,_bl2_1,1) - VMAC(vmlal.s32,_a1a,_ah0_1,_bl2_0,1) - VMAC(vmlal.s32,_a1a,_ah2_0,_bl0_1,1) - VMAC(vmlal.s32,_a1a,_ah2_1,_bl0_0,1) - - VOP2(vswp,_a0b_1,_a0a_0) - - VMAC(vmlal.s32,_a1b,_al0_0,_bs2_1,1) - VMAC(vmlal.s32,_a1b,_al0_1,_bs2_0,1) - VMAC(vmlal.s32,_a1b,_al2_0,_bs0_1,1) - VMAC(vmlal.s32,_a1b,_al2_1,_bs0_0,1) - - VOP3(vsra.u64,_a0a,_a0b,"#28") - VOP2(vmovn.i64,_a0b_0,_a0b) - - VOP2(vswp,_a1b_1,_a1a_0) - VOP3(vadd.i64,_a0a,_a0a,_a1b) - - VOP2(vmovn.i64,_a0b_1,_a0a) - VOP3(vsra.u64,_a1a,_a0a,"#28") - - VOP2(vbic.i32,_a0b,"#0xf0000000") - - VOP2(vswp,_a1a_0,_a1a_1) - - "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t" - "sub %[c], #64" "\n\t" - - VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0) - - "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t" - VOP2(vaddw.s32,_a1a,_a0a_0) - VOP2(vmovn.i64,_a0a_0,_a1a) - VOP2(vshr.s64,_a1a,"#28") - - VOP2(vaddw.s32,_a1a,_a0a_1) - VOP2(vmovn.i64,_a0a_1,_a1a) - VOP2(vshr.s64,_a1a,"#28") - - VOP2(vbic.i32,_a0a,"#0xf0000000") - - VOP2(vaddw.s32,_a1a,_a0b_0) - VOP2(vmovn.i64,_a0b_0,_a1a) - - "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t" - - : [a]"+r"(as) - , [b]"+r"(bs) - , [c]"+r"(vc) - - :: "q0","q1","q2","q3", - "q4","q5","q6","q7", - "q8","q9","q10","q11", - "q12","q13","q14","q15", - "memory" - ); -} - -void gf_sqr(gf_s *__restrict__ cs, const gf bs) -{ - int32x2_t *vc = (int32x2_t*) cs->limb; - - __asm__ __volatile__ ( - "vld2.32 {"_bl0_0","_bl0_1","_bh0_0","_bh0_1"}, [%[b],:128]!" "\n\t" - VOP3(vadd.i32,_bs0_1,_bl0_1,_bh0_1) /* 0 .. 2^30 */ - VOP3(vsub.i32,_bs0_0,_bl0_0,_bh0_0) /* +- 2^29 */ - VOP3(vadd.i32,_as0,_bl0,_bh0) /* 0 .. 2^30 */ - - "vld2.32 {"_bl2_0","_bl2_1","_bh2_0","_bh2_1"}, [%[b],:128]!" "\n\t" - VOP3(vadd.i32,_bs2,_bl2,_bh2) /* 0 .. 2^30 */ - VOP2(vmov,_as2,_bs2) - - VMAC(vqdmull.s32,_a0b,_as0_1,_bs2_1,0) /* 0 .. 8 * 2^58. danger for vqdmlal is 32 */ - VMAC(vmlal.s32,_a0b,_as2_0,_bs2_0,0) /* 0 .. 12 */ - VMAC(vmlal.s32,_a0b,_as0_0,_bh0_0,0) /* 0 .. 14 */ - - VMAC(vqdmull.s32,_a1b,_as0_1,_bs2_1,1) /* 0 .. 8 */ - VMAC(vmlal.s32,_a1b,_as2_0,_bs2_0,1) /* 0 .. 14 */ - VMAC(vmlal.s32,_a1b,_as0_0,_bh0_0,1) /* 0 .. 
16 */ - - VOP2(vmov,_a0a,_a0b) /* 0 .. 14 */ - VMAC(vqdmlal.s32,_a0a,_bh0_1,_bh2_1,0) /* 0 .. 16 */ - VMAC(vmlal.s32,_a0a,_bh2_0,_bh2_0,0) /* 0 .. 17 */ - VMAC(vmlal.s32,_a0a,_bh0_0,_bl0_0,0) /* 0 .. 18 */ - - VMAC(vqdmlsl.s32,_a0b,_bl0_1,_bl2_1,0) /*-2 .. 14 */ - VMAC(vmlsl.s32,_a0b,_bl2_0,_bl2_0,0) /*-3 .. 14 */ - VMAC(vmlal.s32,_a0b,_bl0_0,_bs0_0,0) /*-4 .. 15 */ - - VOP2(vmov,_a1a,_a1b) - VMAC(vqdmlal.s32,_a1a,_bh0_1,_bh2_1,1) /* 0 .. 18 */ - VMAC(vmlal.s32,_a1a,_bh2_0,_bh2_0,1) /* 0 .. 19 */ - VMAC(vmlal.s32,_a1a,_bh0_0,_bl0_0,1) /* 0 .. 20 */ - - VOP2(vswp,_a0b_1,_a0a_0) - - VMAC(vqdmlsl.s32,_a1b,_bl0_1,_bl2_1,1) /*-2 .. 16 */ - VMAC(vmlsl.s32,_a1b,_bl2_0,_bl2_0,1) /*-3 .. 16 */ - VMAC(vmlal.s32,_a1b,_bl0_0,_bs0_0,1) /*-4 .. 17 */ - - VOP3(vsra.u64,_a0a,_a0b,"#28") - VOP3(vsub.i32,_bs0_1,_bl0_1,_bh0_1) - VOP2(vmovn.i64,_a0b_0,_a0b) - - VOP2(vswp,_a1b_1,_a1a_0) - VOP3(vadd.i64,_a1b,_a0a,_a1b) - - - VMAC(vqdmull.s32,_a0a,_as2_0,_bs2_1,0) /* 0 .. 8 */ - VOP2(vmovn.i64,_a0b_1,_a1b) - VOP3(vsra.u64,_a1a,_a1b,"#28") - VMAC(vqdmlal.s32,_a0a,_as0_0,_bh0_1,0) /* 0 .. 12 */ - VOP2(vbic.i32,_a0b,"#0xf0000000") - "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t" - - VMAC(vqdmull.s32,_a1b,_as2_0,_bs2_1,1) /* 0 .. 8 */ - VMAC(vqdmlal.s32,_a1b,_as0_0,_bh0_1,1) /* 0 .. 12 */ - - VOP2(vmov,_a0b,_a0a) /* 0 .. 12 */ - VMAC(vqdmlal.s32,_a0a,_bh2_0,_bh2_1,0) /* 0 .. 14 */ - VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl0_1,0) /* 0 .. 16 */ - - VMAC(vqdmlsl.s32,_a0b,_bl2_0,_bl2_1,0) /*-2 .. 12 */ - VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs0_1,0) /*-4 .. 14 */ - VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1) - VOP3(vadd.i64,_a0b_0,_a0b_0,_a1a_0) - - VOP2(vmov,_a1a,_a1b) /* 0 .. 12 */ - VMAC(vqdmlal.s32,_a1a,_bh2_0,_bh2_1,1) /* 0 .. 14 */ - VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl0_1,1) /* 0 .. 16 */ - - VOP2(vswp,_a0b_1,_a0a_0) - - VMAC(vqdmlsl.s32,_a1b,_bl2_0,_bl2_1,1) /*-2 .. 12 */ - VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs0_1,1) /*-4 .. 
14 */ - - VOP3(vsra.u64,_a0a,_a0b,"#28") - VOP3(vsub.i32,_bs2_0,_bl2_0,_bh2_0) - VOP2(vmovn.i64,_a0b_0,_a0b) - - VOP2(vswp,_a1b_1,_a1a_0) - VOP3(vadd.i64,_a1b,_a0a,_a1b) - - VMAC(vmull.s32,_a0a,_as2_1,_bs2_1,0) - VOP2(vmovn.i64,_a0b_1,_a1b) - VMAC(vqdmlal.s32,_a0a,_as0_0,_bh2_0,0) - VOP3(vsra.u64,_a1a,_a1b,"#28") - VMAC(vmlal.s32,_a0a,_as0_1,_bh0_1,0) - VOP2(vbic.i32,_a0b,"#0xf0000000") - "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t" - - VMAC(vmull.s32,_a1b,_as2_1,_bs2_1,1) - VMAC(vqdmlal.s32,_a1b,_as0_0,_bh2_0,1) - VMAC(vmlal.s32,_a1b,_as0_1,_bh0_1,1) - - VOP2(vmov,_a0b_1,_a0a_1) - VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0) - VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1) - VMAC(vmlal.s32,_a0a,_bh2_1,_bh2_1,0) - VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_0,0) - VMAC(vmlal.s32,_a0a,_bh0_1,_bl0_1,0) - - VMAC(vmlsl.s32,_a0b,_bl2_1,_bl2_1,0) - VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_0,0) - VMAC(vmlal.s32,_a0b,_bl0_1,_bs0_1,0) - - VOP2(vmov,_a1a,_a1b) - VMAC(vmlal.s32,_a1a,_bh2_1,_bh2_1,1) - VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_0,1) - VMAC(vmlal.s32,_a1a,_bh0_1,_bl0_1,1) - - VOP2(vswp,_a0b_1,_a0a_0) - - VMAC(vmlsl.s32,_a1b,_bl2_1,_bl2_1,1) - VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_0,1) - VMAC(vmlal.s32,_a1b,_bl0_1,_bs0_1,1) - - VOP3(vsub.i32,_bs2_1,_bl2_1,_bh2_1) - VOP3(vsra.u64,_a0a,_a0b,"#28") - VOP2(vmovn.i64,_a0b_0,_a0b) - - VOP2(vswp,_a1b_1,_a1a_0) - VOP3(vadd.i64,_a1b,_a0a,_a1b) - - VMAC(vqdmull.s32,_a0a,_as0_0,_bh2_1,0) - VOP2(vmovn.i64,_a0b_1,_a1b) - VOP3(vsra.u64,_a1a,_a1b,"#28") - VMAC(vqdmlal.s32,_a0a,_as2_0,_bh0_1,0) - VOP2(vbic.i32,_a0b,"#0xf0000000") - "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t" - - VMAC(vqdmull.s32,_a1b,_as0_0,_bh2_1,1) - VMAC(vqdmlal.s32,_a1b,_as2_0,_bh0_1,1) - - VOP2(vmov,_a0b_1,_a0a_1) - VOP3(vadd.i64,_a0b_0,_a0a_0,_a1a_0) - VOP3(vadd.i64,_a0a_0,_a0a_0,_a1a_1) - VMAC(vqdmlal.s32,_a0a,_bh0_0,_bl2_1,0) - VMAC(vqdmlal.s32,_a0a,_bh2_0,_bl0_1,0) - - VMAC(vqdmlal.s32,_a0b,_bl0_0,_bs2_1,0) - VMAC(vqdmlal.s32,_a0b,_bl2_0,_bs0_1,0) - - VOP2(vmov,_a1a,_a1b) - VMAC(vqdmlal.s32,_a1a,_bh0_0,_bl2_1,1) - VMAC(vqdmlal.s32,_a1a,_bh2_0,_bl0_1,1) - - VOP2(vswp,_a0b_1,_a0a_0) - - VMAC(vqdmlal.s32,_a1b,_bl0_0,_bs2_1,1) - VMAC(vqdmlal.s32,_a1b,_bl2_0,_bs0_1,1) - - VOP3(vsra.u64,_a0a,_a0b,"#28") - VOP2(vmovn.i64,_a0b_0,_a0b) - - VOP2(vswp,_a1b_1,_a1a_0) - VOP3(vadd.i64,_a0a,_a0a,_a1b) - - VOP2(vmovn.i64,_a0b_1,_a0a) - VOP3(vsra.u64,_a1a,_a0a,"#28") - - VOP2(vbic.i32,_a0b,"#0xf0000000") - - VOP2(vswp,_a1a_0,_a1a_1) - - "vstmia %[c]!, {"_a0b_0", "_a0b_1"}" "\n\t" - "sub %[c], #64" "\n\t" - - VOP3(vadd.i64,_a1a_1,_a1a_1,_a1a_0) - - "vldmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t" - VOP2(vaddw.s32,_a1a,_a0a_0) - VOP2(vmovn.i64,_a0a_0,_a1a) - VOP2(vshr.s64,_a1a,"#28") - - VOP2(vaddw.s32,_a1a,_a0a_1) - VOP2(vmovn.i64,_a0a_1,_a1a) - VOP2(vshr.s64,_a1a,"#28") - - VOP2(vbic.i32,_a0a,"#0xf0000000") - - VOP2(vaddw.s32,_a1a,_a0b_0) - VOP2(vmovn.i64,_a0b_0,_a1a) - - "vstmia %[c], {"_a0a_0", "_a0a_1", "_a0b_0"}" "\n\t" - - : [b]"+r"(bs) - , [c]"+r"(vc) - - :: "q0","q1","q2","q3", - "q4","q5","q6","q7", - "q12","q13","q14","q15", - "memory" - ); -} - -void gf_mulw_unsigned(gf_s *__restrict__ cs, const gf as, uint32_t b) -{ - uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1}; - assert(b<(1<<28)); - - uint64x2_t accum; - const uint32x2_t *va = (const uint32x2_t *) as->limb; - uint32x2_t *vo = (uint32x2_t *) cs->limb; - uint32x2_t vc, vn; - uint32x2_t vb = {b, 0}; - - vc = va[0]; - accum = vmull_lane_u32(vc, vb, 0); - vo[0] = vmovn_u64(accum) & vmask; - accum = vshrq_n_u64(accum,28); - - /* PERF: the right way to do this is to reduce behind, 
i.e. - * vmull + vmlal round 0 - * vmull + vmlal round 1 - * vmull + vmlal round 2 - * vsraq round 0, 1 - * vmull + vmlal round 3 - * vsraq round 1, 2 - * ... - */ - - int i; - for (i=1; i<8; i++) { - vn = va[i]; - accum = vmlal_lane_u32(accum, vn, vb, 0); - vo[i] = vmovn_u64(accum) & vmask; - accum = vshrq_n_u64(accum,28); - vc = vn; - } - - accum = xx_vaddup_u64(vrev128_u64(accum)); - accum = vaddw_u32(accum, vo[0]); - vo[0] = vmovn_u64(accum) & vmask; - - accum = vshrq_n_u64(accum,28); - vo[1] += vmovn_u64(accum); -} diff --git a/crypto/ec/curve448/arch_neon/f_impl.h b/crypto/ec/curve448/arch_neon/f_impl.h deleted file mode 100644 index 58c9dd045f29e73bb2b17d1334b31c774ae29359..0000000000000000000000000000000000000000 --- a/crypto/ec/curve448/arch_neon/f_impl.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. - * Copyright 2014-2016 Cryptography Research, Inc. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - * - * Originally written by Mike Hamburg - */ - -#define GF_HEADROOM 2 -#define LIMBPERM(x) (((x)<<1 | (x)>>3) & 15) -#define USE_NEON_PERM 1 -#define LIMBHI(x) ((x##ull)>>28) -#define LIMBLO(x) ((x##ull)&((1ull<<28)-1)) -#define FIELD_LITERAL(a,b,c,d,e,f,g,h) { \ - { \ - LIMBLO(a), LIMBLO(e), LIMBHI(a), LIMBHI(e), LIMBLO(b), LIMBLO(f), \ - LIMBHI(b), LIMBHI(f), LIMBLO(c), LIMBLO(g), LIMBHI(c), LIMBHI(g), \ - LIMBLO(d), LIMBLO(h), LIMBHI(d), LIMBHI(h) \ - } \ -} - -#define LIMB_PLACE_VALUE(i) 28 - -void gf_add_RAW(gf out, const gf a, const gf b) -{ - for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) { - ((uint32xn_t *) out)[i] = - ((const uint32xn_t *)a)[i] + ((const uint32xn_t *)b)[i]; - } -} - -void gf_sub_RAW(gf out, const gf a, const gf b) -{ - for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) { - ((uint32xn_t *) out)[i] = - ((const uint32xn_t *)a)[i] - ((const uint32xn_t *)b)[i]; - } -} - -void gf_bias(gf a, int amt) -{ - uint32_t co1 = ((1ull << 28) - 1) * amt, co2 = co1 - amt; - uint32x4_t lo = { co1, co2, co1, co1 }; - uint32x4_t hi = { co1, co1, co1, co1 }; - uint32x4_t *aa = (uint32x4_t *) a; - aa[0] += lo; - aa[1] += hi; - aa[2] += hi; - aa[3] += hi; -} - -void gf_weak_reduce(gf a) -{ - uint32x2_t *aa = (uint32x2_t *) a; - uint32x2_t vmask = { (1ull << 28) - 1, (1ull << 28) - 1}; - uint32x2_t vm2 = { 0, -1}, tmp = vshr_n_u32(aa[7], 28); - - for (unsigned int i = 7; i >= 1; i--) - aa[i] = vsra_n_u32(aa[i] & vmask, aa[i - 1], 28); - aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp & vm2); -} diff --git a/crypto/ec/curve448/arch_ref64/arch_intrinsics.h b/crypto/ec/curve448/arch_ref64/arch_intrinsics.h deleted file mode 100644 index 650b63897deed40d3bf649b0a432df5b064dab6a..0000000000000000000000000000000000000000 --- a/crypto/ec/curve448/arch_ref64/arch_intrinsics.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. - * Copyright 2016 Cryptography Research, Inc. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. 
You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - * - * Originally written by Mike Hamburg - */ - -#ifndef __ARCH_REF64_ARCH_INTRINSICS_H__ -# define __ARCH_REF64_ARCH_INTRINSICS_H__ - -# define ARCH_WORD_BITS 64 - -static __inline__ __attribute((always_inline, unused)) -uint64_t word_is_zero(uint64_t a) -{ - /* let's hope the compiler isn't clever enough to optimize this. */ - return (((__uint128_t) a) - 1) >> 64; -} - -static __inline__ __attribute((always_inline, unused)) -__uint128_t widemul(uint64_t a, uint64_t b) -{ - return ((__uint128_t) a) * b; -} - -#endif /* ARCH_REF64_ARCH_INTRINSICS_H__ */ diff --git a/crypto/ec/curve448/arch_ref64/f_impl.c b/crypto/ec/curve448/arch_ref64/f_impl.c deleted file mode 100644 index 7cb5749cdaa304655d79b2eebb403779128a9447..0000000000000000000000000000000000000000 --- a/crypto/ec/curve448/arch_ref64/f_impl.c +++ /dev/null @@ -1,308 +0,0 @@ -/* - * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. - * Copyright 2014 Cryptography Research, Inc. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - * - * Originally written by Mike Hamburg - */ -#include "field.h" - -void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs) -{ - const uint64_t *a = as->limb, *b = bs->limb; - uint64_t *c = cs->limb; - __uint128_t accum0 = 0, accum1 = 0, accum2; - uint64_t mask = (1ull << 56) - 1; - uint64_t aa[4], bb[4], bbb[4]; - unsigned int i; - - for (i = 0; i < 4; i++) { - aa[i] = a[i] + a[i + 4]; - bb[i] = b[i] + b[i + 4]; - bbb[i] = bb[i] + b[i + 4]; - } - - int I_HATE_UNROLLED_LOOPS = 0; - - if (I_HATE_UNROLLED_LOOPS) { - /* - * The compiler probably won't unroll this, so it's like 80% slower. 
- */ - for (i = 0; i < 4; i++) { - accum2 = 0; - - unsigned int j; - for (j = 0; j <= i; j++) { - accum2 += widemul(a[j], b[i - j]); - accum1 += widemul(aa[j], bb[i - j]); - accum0 += widemul(a[j + 4], b[i - j + 4]); - } - for (; j < 4; j++) { - accum2 += widemul(a[j], b[i - j + 8]); - accum1 += widemul(aa[j], bbb[i - j + 4]); - accum0 += widemul(a[j + 4], bb[i - j + 4]); - } - - accum1 -= accum2; - accum0 += accum2; - - c[i] = ((uint64_t)(accum0)) & mask; - c[i + 4] = ((uint64_t)(accum1)) & mask; - - accum0 >>= 56; - accum1 >>= 56; - } - } else { - accum2 = widemul(a[0], b[0]); - accum1 += widemul(aa[0], bb[0]); - accum0 += widemul(a[4], b[4]); - - accum2 += widemul(a[1], b[7]); - accum1 += widemul(aa[1], bbb[3]); - accum0 += widemul(a[5], bb[3]); - - accum2 += widemul(a[2], b[6]); - accum1 += widemul(aa[2], bbb[2]); - accum0 += widemul(a[6], bb[2]); - - accum2 += widemul(a[3], b[5]); - accum1 += widemul(aa[3], bbb[1]); - accum0 += widemul(a[7], bb[1]); - - accum1 -= accum2; - accum0 += accum2; - - c[0] = ((uint64_t)(accum0)) & mask; - c[4] = ((uint64_t)(accum1)) & mask; - - accum0 >>= 56; - accum1 >>= 56; - - accum2 = widemul(a[0], b[1]); - accum1 += widemul(aa[0], bb[1]); - accum0 += widemul(a[4], b[5]); - - accum2 += widemul(a[1], b[0]); - accum1 += widemul(aa[1], bb[0]); - accum0 += widemul(a[5], b[4]); - - accum2 += widemul(a[2], b[7]); - accum1 += widemul(aa[2], bbb[3]); - accum0 += widemul(a[6], bb[3]); - - accum2 += widemul(a[3], b[6]); - accum1 += widemul(aa[3], bbb[2]); - accum0 += widemul(a[7], bb[2]); - - accum1 -= accum2; - accum0 += accum2; - - c[1] = ((uint64_t)(accum0)) & mask; - c[5] = ((uint64_t)(accum1)) & mask; - - accum0 >>= 56; - accum1 >>= 56; - - accum2 = widemul(a[0], b[2]); - accum1 += widemul(aa[0], bb[2]); - accum0 += widemul(a[4], b[6]); - - accum2 += widemul(a[1], b[1]); - accum1 += widemul(aa[1], bb[1]); - accum0 += widemul(a[5], b[5]); - - accum2 += widemul(a[2], b[0]); - accum1 += widemul(aa[2], bb[0]); - accum0 += widemul(a[6], b[4]); - - accum2 += widemul(a[3], b[7]); - accum1 += widemul(aa[3], bbb[3]); - accum0 += widemul(a[7], bb[3]); - - accum1 -= accum2; - accum0 += accum2; - - c[2] = ((uint64_t)(accum0)) & mask; - c[6] = ((uint64_t)(accum1)) & mask; - - accum0 >>= 56; - accum1 >>= 56; - - accum2 = widemul(a[0], b[3]); - accum1 += widemul(aa[0], bb[3]); - accum0 += widemul(a[4], b[7]); - - accum2 += widemul(a[1], b[2]); - accum1 += widemul(aa[1], bb[2]); - accum0 += widemul(a[5], b[6]); - - accum2 += widemul(a[2], b[1]); - accum1 += widemul(aa[2], bb[1]); - accum0 += widemul(a[6], b[5]); - - accum2 += widemul(a[3], b[0]); - accum1 += widemul(aa[3], bb[0]); - accum0 += widemul(a[7], b[4]); - - accum1 -= accum2; - accum0 += accum2; - - c[3] = ((uint64_t)(accum0)) & mask; - c[7] = ((uint64_t)(accum1)) & mask; - - accum0 >>= 56; - accum1 >>= 56; - } /* !I_HATE_UNROLLED_LOOPS */ - - accum0 += accum1; - accum0 += c[4]; - accum1 += c[0]; - c[4] = ((uint64_t)(accum0)) & mask; - c[0] = ((uint64_t)(accum1)) & mask; - - accum0 >>= 56; - accum1 >>= 56; - - c[5] += ((uint64_t)(accum0)); - c[1] += ((uint64_t)(accum1)); -} - -void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b) -{ - const uint64_t *a = as->limb; - uint64_t *c = cs->limb; - __uint128_t accum0 = 0, accum4 = 0; - uint64_t mask = (1ull << 56) - 1; - int i; - - for (i = 0; i < 4; i++) { - accum0 += widemul(b, a[i]); - accum4 += widemul(b, a[i + 4]); - c[i] = accum0 & mask; - accum0 >>= 56; - c[i + 4] = accum4 & mask; - accum4 >>= 56; - } - - accum0 += accum4 + c[4]; - c[4] = accum0 & 
mask; - c[5] += accum0 >> 56; - - accum4 += c[0]; - c[0] = accum4 & mask; - c[1] += accum4 >> 56; -} - -void gf_sqr(gf_s * __restrict__ cs, const gf as) -{ - const uint64_t *a = as->limb; - uint64_t *c = cs->limb; - __uint128_t accum0 = 0, accum1 = 0, accum2; - uint64_t mask = (1ull << 56) - 1; - uint64_t aa[4]; - - /* For some reason clang doesn't vectorize this without prompting? */ - unsigned int i; - for (i = 0; i < 4; i++) { - aa[i] = a[i] + a[i + 4]; - } - - accum2 = widemul(a[0], a[3]); - accum0 = widemul(aa[0], aa[3]); - accum1 = widemul(a[4], a[7]); - - accum2 += widemul(a[1], a[2]); - accum0 += widemul(aa[1], aa[2]); - accum1 += widemul(a[5], a[6]); - - accum0 -= accum2; - accum1 += accum2; - - c[3] = ((uint64_t)(accum1)) << 1 & mask; - c[7] = ((uint64_t)(accum0)) << 1 & mask; - - accum0 >>= 55; - accum1 >>= 55; - - accum0 += widemul(2 * aa[1], aa[3]); - accum1 += widemul(2 * a[5], a[7]); - accum0 += widemul(aa[2], aa[2]); - accum1 += accum0; - - accum0 -= widemul(2 * a[1], a[3]); - accum1 += widemul(a[6], a[6]); - - accum2 = widemul(a[0], a[0]); - accum1 -= accum2; - accum0 += accum2; - - accum0 -= widemul(a[2], a[2]); - accum1 += widemul(aa[0], aa[0]); - accum0 += widemul(a[4], a[4]); - - c[0] = ((uint64_t)(accum0)) & mask; - c[4] = ((uint64_t)(accum1)) & mask; - - accum0 >>= 56; - accum1 >>= 56; - - accum2 = widemul(2 * aa[2], aa[3]); - accum0 -= widemul(2 * a[2], a[3]); - accum1 += widemul(2 * a[6], a[7]); - - accum1 += accum2; - accum0 += accum2; - - accum2 = widemul(2 * a[0], a[1]); - accum1 += widemul(2 * aa[0], aa[1]); - accum0 += widemul(2 * a[4], a[5]); - - accum1 -= accum2; - accum0 += accum2; - - c[1] = ((uint64_t)(accum0)) & mask; - c[5] = ((uint64_t)(accum1)) & mask; - - accum0 >>= 56; - accum1 >>= 56; - - accum2 = widemul(aa[3], aa[3]); - accum0 -= widemul(a[3], a[3]); - accum1 += widemul(a[7], a[7]); - - accum1 += accum2; - accum0 += accum2; - - accum2 = widemul(2 * a[0], a[2]); - accum1 += widemul(2 * aa[0], aa[2]); - accum0 += widemul(2 * a[4], a[6]); - - accum2 += widemul(a[1], a[1]); - accum1 += widemul(aa[1], aa[1]); - accum0 += widemul(a[5], a[5]); - - accum1 -= accum2; - accum0 += accum2; - - c[2] = ((uint64_t)(accum0)) & mask; - c[6] = ((uint64_t)(accum1)) & mask; - - accum0 >>= 56; - accum1 >>= 56; - - accum0 += c[3]; - accum1 += c[7]; - c[3] = ((uint64_t)(accum0)) & mask; - c[7] = ((uint64_t)(accum1)) & mask; - - /* we could almost stop here, but it wouldn't be stable, so... */ - - accum0 >>= 56; - accum1 >>= 56; - c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); - c[0] += ((uint64_t)(accum1)); -} diff --git a/crypto/ec/curve448/arch_ref64/f_impl.h b/crypto/ec/curve448/arch_ref64/f_impl.h deleted file mode 100644 index 8751ceecd9f8850adb0fe8f5d45a2a9908e700db..0000000000000000000000000000000000000000 --- a/crypto/ec/curve448/arch_ref64/f_impl.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. - * Copyright 2014-2016 Cryptography Research, Inc. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. 
You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - * - * Originally written by Mike Hamburg - */ - -#define GF_HEADROOM 9999 /* Everything is reduced anyway */ -#define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}} - -#define LIMB_PLACE_VALUE(i) 56 - -void gf_add_RAW(gf out, const gf a, const gf b) -{ - for (unsigned int i = 0; i < 8; i++) - out->limb[i] = a->limb[i] + b->limb[i]; - gf_weak_reduce(out); -} - -void gf_sub_RAW(gf out, const gf a, const gf b) -{ - uint64_t co1 = ((1ull << 56) - 1) * 2, co2 = co1 - 2; - - for (unsigned int i = 0; i < 8; i++) - out->limb[i] = a->limb[i] - b->limb[i] + ((i == 4) ? co2 : co1); - gf_weak_reduce(out); -} - -void gf_bias(gf a, int amt) -{ - (void)a; - (void)amt; -} - -void gf_weak_reduce(gf a) -{ - uint64_t mask = (1ull << 56) - 1; - uint64_t tmp = a->limb[7] >> 56; - - a->limb[4] += tmp; - for (unsigned int i = 7; i > 0; i--) - a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56); - a->limb[0] = (a->limb[0] & mask) + tmp; -} diff --git a/crypto/ec/curve448/arch_x86_64/arch_intrinsics.h b/crypto/ec/curve448/arch_x86_64/arch_intrinsics.h deleted file mode 100644 index c31b64e4b59d5fff736e7142f525a073c142ab83..0000000000000000000000000000000000000000 --- a/crypto/ec/curve448/arch_x86_64/arch_intrinsics.h +++ /dev/null @@ -1,338 +0,0 @@ -/* - * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. - * Copyright 2014-2016 Cryptography Research, Inc. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - * - * Originally written by Mike Hamburg - */ -#ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__ -#define __ARCH_X86_64_ARCH_INTRINSICS_H__ - -#define ARCH_WORD_BITS 64 - -#include - -/* FUTURE: autogenerate */ -static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) -{ - uint64_t c, d; - - #ifndef __BMI2__ - __asm__ volatile - ("movq %[a], %%rax;" - "mulq %[b];" - : [c]"=&a"(c), [d]"=d"(d) - : [b]"m"(*b), [a]"m"(*a) - : "cc"); - #else - __asm__ volatile - ("movq %[a], %%rdx;" - "mulx %[b], %[c], %[d];" - : [c]"=r"(c), [d]"=r"(d) - : [b]"m"(*b), [a]"m"(*a) - : "rdx"); - #endif - return (((__uint128_t)(d)) << 64) | c; -} - -static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) -{ - uint64_t c, d; - - #ifndef __BMI2__ - __asm__ volatile - ("movq %[a], %%rax;" - "mulq %[b];" - : [c]"=&a"(c), [d]"=d"(d) - : [b]"m"(*b), [a]"r"(a) - : "cc"); - #else - __asm__ volatile - ("mulx %[b], %[c], %[d];" - : [c]"=r"(c), [d]"=r"(d) - : [b]"m"(*b), [a]"d"(a)); - #endif - return (((__uint128_t)(d)) << 64) | c; -} - -static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) -{ - uint64_t c, d; - - #ifndef __BMI2__ - __asm__ volatile - ("mulq %[b];" - : [c]"=a"(c), [d]"=d"(d) - : [b]"r"(b), "a"(a) - : "cc"); - #else - __asm__ volatile - ("mulx %[b], %[c], %[d];" - : [c]"=r"(c), [d]"=r"(d) - : [b]"r"(b), [a]"d"(a)); - #endif - return (((__uint128_t)(d)) << 64) | c; -} - -static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) -{ - uint64_t c, d; - - #ifndef __BMI2__ - __asm__ volatile - ("movq %[a], %%rax; " - "addq %%rax, %%rax; " - "mulq %[b];" - : [c]"=&a"(c), [d]"=d"(d) - : [b]"m"(*b), [a]"m"(*a) - : "cc"); - #else - __asm__ volatile - ("movq %[a], %%rdx;" - "leaq (,%%rdx,2), %%rdx;" - "mulx %[b], %[c], %[d];" - 
: [c]"=r"(c), [d]"=r"(d) - : [b]"m"(*b), [a]"m"(*a) - : "rdx"); - #endif - return (((__uint128_t)(d)) << 64) | c; -} - -static __inline__ void mac(__uint128_t *acc, const uint64_t *a, - const uint64_t *b) -{ - uint64_t lo = *acc, hi = *acc >> 64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=&r"(c), [d]"=&r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi)) << 64) | lo; -} - -static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, - const uint64_t *a, const uint64_t *b) -{ - uint64_t lo = *acc, hi = *acc >> 64; - uint64_t lo2 = *acc2, hi2 = *acc2 >> 64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - "addq %[c], %[lo2]; " - "adcq %[d], %[hi2]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - "addq %%rax, %[lo2]; " - "adcq %%rdx, %[hi2]; " - : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi)) << 64) | lo; - *acc2 = (((__uint128_t)(hi2)) << 64) | lo2; -} - -static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) -{ - uint64_t lo = *acc, hi = *acc >> 64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"d"(a) - : "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"r"(a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi)) << 64) | lo; -} - -static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) -{ - uint64_t lo = *acc, hi = *acc >> 64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"r"(b), [a]"d"(a) - : "cc"); - #else - __asm__ volatile - ("mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi), "+a"(a) - : [b]"r"(b) - : "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi)) << 64) | lo; -} - -static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, - const uint64_t *b) -{ - uint64_t lo = *acc, hi = *acc >> 64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "addq %%rdx, %%rdx; " - "mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "addq %%rax, %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi)) << 64) | lo; -} - -static __inline__ void 
-
-static __inline__ void msb(__uint128_t *acc, const uint64_t *a,
-                           const uint64_t *b)
-{
-    uint64_t lo = *acc, hi = *acc >> 64;
-
-    #ifdef __BMI2__
-    uint64_t c,d;
-    __asm__ volatile
-        ("movq %[a], %%rdx; "
-         "mulx %[b], %[c], %[d]; "
-         "subq %[c], %[lo]; "
-         "sbbq %[d], %[hi]; "
-         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
-         : [b]"m"(*b), [a]"m"(*a)
-         : "rdx", "cc");
-    #else
-    __asm__ volatile
-        ("movq %[a], %%rax; "
-         "mulq %[b]; "
-         "subq %%rax, %[lo]; "
-         "sbbq %%rdx, %[hi]; "
-         : [lo]"+r"(lo), [hi]"+r"(hi)
-         : [b]"m"(*b), [a]"m"(*a)
-         : "rax", "rdx", "cc");
-    #endif
-    *acc = (((__uint128_t)(hi)) << 64) | lo;
-}
-
-static __inline__ void msb2(__uint128_t *acc, const uint64_t *a,
-                            const uint64_t *b)
-{
-    uint64_t lo = *acc, hi = *acc >> 64;
-
-    #ifdef __BMI2__
-    uint64_t c,d;
-    __asm__ volatile
-        ("movq %[a], %%rdx; "
-         "addq %%rdx, %%rdx; "
-         "mulx %[b], %[c], %[d]; "
-         "subq %[c], %[lo]; "
-         "sbbq %[d], %[hi]; "
-         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
-         : [b]"m"(*b), [a]"m"(*a)
-         : "rdx", "cc");
-    #else
-    __asm__ volatile
-        ("movq %[a], %%rax; "
-         "addq %%rax, %%rax; "
-         "mulq %[b]; "
-         "subq %%rax, %[lo]; "
-         "sbbq %%rdx, %[hi]; "
-         : [lo]"+r"(lo), [hi]"+r"(hi)
-         : [b]"m"(*b), [a]"m"(*a)
-         : "rax", "rdx", "cc");
-    #endif
-    *acc = (((__uint128_t)(hi))<<64) | lo;
-
-}
-
-static __inline__ void mrs(__uint128_t *acc, const uint64_t *a,
-                           const uint64_t *b)
-{
-    uint64_t c,d, lo = *acc, hi = *acc >> 64;
-    __asm__ volatile
-        ("movq %[a], %%rdx; "
-         "mulx %[b], %[c], %[d]; "
-         "subq %[lo], %[c]; "
-         "sbbq %[hi], %[d]; "
-         : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
-         : [b]"m"(*b), [a]"m"(*a)
-         : "rdx", "cc");
-    *acc = (((__uint128_t)(d)) << 64) | c;
-}
-
-static __inline__ uint64_t word_is_zero(uint64_t x)
-{
-    __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
-    return ~x;
-}
-
-static inline uint64_t shrld(__uint128_t x, int n)
-{
-    return x >> n;
-}
-
-#endif /* __ARCH_X86_64_ARCH_INTRINSICS_H__ */
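word_is_zero() is the classic constant-time zero test: NEG sets the carry flag exactly when x is nonzero, SBB then copies that carry into every bit of the register, and the final complement returns all-ones for x == 0 and zero otherwise, with no data-dependent branch. A branch-free portable sketch (word_is_zero_c is an illustrative name, not from the deleted file):

    /* All-ones iff x == 0: (x | -x) has its top bit set exactly
     * when x != 0, so the shifted bit minus one is the wanted mask. */
    static __inline__ uint64_t word_is_zero_c(uint64_t x)
    {
        return ((x | (0 - x)) >> 63) - 1;
    }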
diff --git a/crypto/ec/curve448/arch_x86_64/f_impl.c b/crypto/ec/curve448/arch_x86_64/f_impl.c
deleted file mode 100644
index c716894f26274413c8e3bcbcbf507ce63f640d8e..0000000000000000000000000000000000000000
--- a/crypto/ec/curve448/arch_x86_64/f_impl.c
+++ /dev/null
@@ -1,308 +0,0 @@
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2014 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-
-#include "field.h"
-
-void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
-{
-    const uint64_t *a = as->limb, *b = bs->limb;
-    uint64_t *c = cs->limb;
-    __uint128_t accum0 = 0, accum1 = 0, accum2;
-    uint64_t mask = (1ull << 56) - 1;
-    uint64_t aa[4] VECTOR_ALIGNED, bb[4] VECTOR_ALIGNED, bbb[4] VECTOR_ALIGNED;
-
-    /* For some reason clang doesn't vectorize this without prompting? */
-    unsigned int i;
-    for (i = 0; i < sizeof(aa) / sizeof(uint64xn_t); i++) {
-        ((uint64xn_t *) aa)[i] =
-            ((const uint64xn_t *)a)[i] + ((const uint64xn_t *)(&a[4]))[i];
-        ((uint64xn_t *) bb)[i] =
-            ((const uint64xn_t *)b)[i] + ((const uint64xn_t *)(&b[4]))[i];
-        ((uint64xn_t *) bbb)[i] =
-            ((const uint64xn_t *)bb)[i] + ((const uint64xn_t *)(&b[4]))[i];
-    }
-    /*
-     * for (int i=0; i<4; i++) { aa[i] = a[i] + a[i+4]; bb[i] = b[i] + b[i+4];
-     * }
-     */
-
-    accum2 = widemul(&a[0], &b[3]);
-    accum0 = widemul(&aa[0], &bb[3]);
-    accum1 = widemul(&a[4], &b[7]);
-
-    mac(&accum2, &a[1], &b[2]);
-    mac(&accum0, &aa[1], &bb[2]);
-    mac(&accum1, &a[5], &b[6]);
-
-    mac(&accum2, &a[2], &b[1]);
-    mac(&accum0, &aa[2], &bb[1]);
-    mac(&accum1, &a[6], &b[5]);
-
-    mac(&accum2, &a[3], &b[0]);
-    mac(&accum0, &aa[3], &bb[0]);
-    mac(&accum1, &a[7], &b[4]);
-
-    accum0 -= accum2;
-    accum1 += accum2;
-
-    c[3] = ((uint64_t)(accum1)) & mask;
-    c[7] = ((uint64_t)(accum0)) & mask;
-
-    accum0 >>= 56;
-    accum1 >>= 56;
-
-    mac(&accum0, &aa[1], &bb[3]);
-    mac(&accum1, &a[5], &b[7]);
-    mac(&accum0, &aa[2], &bb[2]);
-    mac(&accum1, &a[6], &b[6]);
-    mac(&accum0, &aa[3], &bb[1]);
-    accum1 += accum0;
-
-    accum2 = widemul(&a[0], &b[0]);
-    accum1 -= accum2;
-    accum0 += accum2;
-
-    msb(&accum0, &a[1], &b[3]);
-    msb(&accum0, &a[2], &b[2]);
-    mac(&accum1, &a[7], &b[5]);
-    msb(&accum0, &a[3], &b[1]);
-    mac(&accum1, &aa[0], &bb[0]);
-    mac(&accum0, &a[4], &b[4]);
-
-    c[0] = ((uint64_t)(accum0)) & mask;
-    c[4] = ((uint64_t)(accum1)) & mask;
-
-    accum0 >>= 56;
-    accum1 >>= 56;
-
-    accum2 = widemul(&a[2], &b[7]);
-    mac(&accum0, &a[6], &bb[3]);
-    mac(&accum1, &aa[2], &bbb[3]);
-
-    mac(&accum2, &a[3], &b[6]);
-    mac(&accum0, &a[7], &bb[2]);
-    mac(&accum1, &aa[3], &bbb[2]);
-
-    mac(&accum2, &a[0], &b[1]);
-    mac(&accum1, &aa[0], &bb[1]);
-    mac(&accum0, &a[4], &b[5]);
-
-    mac(&accum2, &a[1], &b[0]);
-    mac(&accum1, &aa[1], &bb[0]);
-    mac(&accum0, &a[5], &b[4]);
-
-    accum1 -= accum2;
-    accum0 += accum2;
-
-    c[1] = ((uint64_t)(accum0)) & mask;
-    c[5] = ((uint64_t)(accum1)) & mask;
-
-    accum0 >>= 56;
-    accum1 >>= 56;
-
-    accum2 = widemul(&a[3], &b[7]);
-    mac(&accum0, &a[7], &bb[3]);
-    mac(&accum1, &aa[3], &bbb[3]);
-
-    mac(&accum2, &a[0], &b[2]);
-    mac(&accum1, &aa[0], &bb[2]);
-    mac(&accum0, &a[4], &b[6]);
-
-    mac(&accum2, &a[1], &b[1]);
-    mac(&accum1, &aa[1], &bb[1]);
-    mac(&accum0, &a[5], &b[5]);
-
-    mac(&accum2, &a[2], &b[0]);
-    mac(&accum1, &aa[2], &bb[0]);
-    mac(&accum0, &a[6], &b[4]);
-
-    accum1 -= accum2;
-    accum0 += accum2;
-
-    c[2] = ((uint64_t)(accum0)) & mask;
-    c[6] = ((uint64_t)(accum1)) & mask;
-
-    accum0 >>= 56;
-    accum1 >>= 56;
-
-    accum0 += c[3];
-    accum1 += c[7];
-    c[3] = ((uint64_t)(accum0)) & mask;
-    c[7] = ((uint64_t)(accum1)) & mask;
-
-    /* we could almost stop here, but it wouldn't be stable, so... */
-
-    accum0 >>= 56;
-    accum1 >>= 56;
-    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
-    c[0] += ((uint64_t)(accum1));
-}
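gf_mul works modulo p = 2^448 - 2^224 - 1 with eight 56-bit limbs. Writing phi = 2^224, the prime gives phi^2 = phi + 1 (mod p), so for a = a0 + a1*phi and b = b0 + b1*phi the Karatsuba-style identity a*b = (a0*b0 + a1*b1) + ((a0 + a1)(b0 + b1) - a0*b0)*phi (mod p) applies, since (a0 + a1)(b0 + b1) - a0*b0 = a0*b1 + a1*b0 + a1*b1. That is why the prologue forms aa[i] = a[i] + a[i+4] and bb[i] = b[i] + b[i+4], and why the carry off the top is folded into both limb 4 (place value phi) and limb 0. A minimal sketch of that fold, assuming a single carry of weight 2^448 (fold_top_carry is an illustrative name):

    /* 2^448 = 2^224 + 1 (mod p), so a carry off the top re-enters at
     * limb 4 (place value 2^224) and at limb 0 (place value 1). */
    static void fold_top_carry(uint64_t limb[8], uint64_t carry)
    {
        limb[4] += carry;
        limb[0] += carry;
    }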
-
-void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
-{
-    const uint64_t *a = as->limb;
-    uint64_t *c = cs->limb;
-
-    __uint128_t accum0, accum4;
-    uint64_t mask = (1ull << 56) - 1;
-
-    accum0 = widemul_rm(b, &a[0]);
-    accum4 = widemul_rm(b, &a[4]);
-
-    c[0] = accum0 & mask;
-    accum0 >>= 56;
-    c[4] = accum4 & mask;
-    accum4 >>= 56;
-
-    mac_rm(&accum0, b, &a[1]);
-    mac_rm(&accum4, b, &a[5]);
-
-    c[1] = accum0 & mask;
-    accum0 >>= 56;
-    c[5] = accum4 & mask;
-    accum4 >>= 56;
-
-    mac_rm(&accum0, b, &a[2]);
-    mac_rm(&accum4, b, &a[6]);
-
-    c[2] = accum0 & mask;
-    accum0 >>= 56;
-    c[6] = accum4 & mask;
-    accum4 >>= 56;
-
-    mac_rm(&accum0, b, &a[3]);
-    mac_rm(&accum4, b, &a[7]);
-
-    c[3] = accum0 & mask;
-    accum0 >>= 56;
-    c[7] = accum4 & mask;
-    accum4 >>= 56;
-
-    accum0 += accum4 + c[4];
-    c[4] = accum0 & mask;
-    c[5] += accum0 >> 56;
-
-    accum4 += c[0];
-    c[0] = accum4 & mask;
-    c[1] += accum4 >> 56;
-}
-
-void gf_sqr(gf_s * __restrict__ cs, const gf as)
-{
-    const uint64_t *a = as->limb;
-    uint64_t *c = cs->limb;
-    __uint128_t accum0 = 0, accum1 = 0, accum2;
-    uint64_t mask = (1ull << 56) - 1;
-    uint64_t aa[4] VECTOR_ALIGNED;
-
-    /* For some reason clang doesn't vectorize this without prompting? */
-    unsigned int i;
-    for (i = 0; i < sizeof(aa) / sizeof(uint64xn_t); i++) {
-        ((uint64xn_t *) aa)[i] =
-            ((const uint64xn_t *)a)[i] + ((const uint64xn_t *)(&a[4]))[i];
-    }
-
-    accum2 = widemul(&a[0], &a[3]);
-    accum0 = widemul(&aa[0], &aa[3]);
-    accum1 = widemul(&a[4], &a[7]);
-
-    mac(&accum2, &a[1], &a[2]);
-    mac(&accum0, &aa[1], &aa[2]);
-    mac(&accum1, &a[5], &a[6]);
-
-    accum0 -= accum2;
-    accum1 += accum2;
-
-    c[3] = ((uint64_t)(accum1)) << 1 & mask;
-    c[7] = ((uint64_t)(accum0)) << 1 & mask;
-
-    accum0 >>= 55;
-    accum1 >>= 55;
-
-    mac2(&accum0, &aa[1], &aa[3]);
-    mac2(&accum1, &a[5], &a[7]);
-    mac(&accum0, &aa[2], &aa[2]);
-    accum1 += accum0;
-
-    msb2(&accum0, &a[1], &a[3]);
-    mac(&accum1, &a[6], &a[6]);
-
-    accum2 = widemul(&a[0], &a[0]);
-    accum1 -= accum2;
-    accum0 += accum2;
-
-    msb(&accum0, &a[2], &a[2]);
-    mac(&accum1, &aa[0], &aa[0]);
-    mac(&accum0, &a[4], &a[4]);
-
-    c[0] = ((uint64_t)(accum0)) & mask;
-    c[4] = ((uint64_t)(accum1)) & mask;
-
-    accum0 >>= 56;
-    accum1 >>= 56;
-
-    accum2 = widemul2(&aa[2], &aa[3]);
-    msb2(&accum0, &a[2], &a[3]);
-    mac2(&accum1, &a[6], &a[7]);
-
-    accum1 += accum2;
-    accum0 += accum2;
-
-    accum2 = widemul2(&a[0], &a[1]);
-    mac2(&accum1, &aa[0], &aa[1]);
-    mac2(&accum0, &a[4], &a[5]);
-
-    accum1 -= accum2;
-    accum0 += accum2;
-
-    c[1] = ((uint64_t)(accum0)) & mask;
-    c[5] = ((uint64_t)(accum1)) & mask;
-
-    accum0 >>= 56;
-    accum1 >>= 56;
-
-    accum2 = widemul(&aa[3], &aa[3]);
-    msb(&accum0, &a[3], &a[3]);
-    mac(&accum1, &a[7], &a[7]);
-
-    accum1 += accum2;
-    accum0 += accum2;
-
-    accum2 = widemul2(&a[0], &a[2]);
-    mac2(&accum1, &aa[0], &aa[2]);
-    mac2(&accum0, &a[4], &a[6]);
-
-    mac(&accum2, &a[1], &a[1]);
-    mac(&accum1, &aa[1], &aa[1]);
-    mac(&accum0, &a[5], &a[5]);
-
-    accum1 -= accum2;
-    accum0 += accum2;
-
-    c[2] = ((uint64_t)(accum0)) & mask;
-    c[6] = ((uint64_t)(accum1)) & mask;
-
-    accum0 >>= 56;
-    accum1 >>= 56;
-
-    accum0 += c[3];
-    accum1 += c[7];
-    c[3] = ((uint64_t)(accum0)) & mask;
-    c[7] = ((uint64_t)(accum1)) & mask;
-
-    /* we could almost stop here, but it wouldn't be stable, so... */
-
-    accum0 >>= 56;
-    accum1 >>= 56;
-    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
-    c[0] += ((uint64_t)(accum1));
-}
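gf_mulw_unsigned runs two independent carry chains, limbs 0-3 in accum0 and limbs 4-7 in accum4, so the dependency chains can execute in parallel; the epilogue cross-wires them, feeding the high chain's weight-2^448 carry into limbs 0/1 (the "+1" component) and both carries into limbs 4/5 (the 2^224 component). gf_sqr is gf_mul with the mirrored cross products dropped, which saves roughly a third of the multiplies. A sketch of one step of the scalar chain, under the same limb layout and with mulw_step an illustrative name:

    /* One step of gf_mulw_unsigned's chain: accumulate limb * w,
     * emit the low 56 bits, keep the rest as carry for the next limb. */
    static uint64_t mulw_step(__uint128_t *accum, uint64_t limb, uint32_t w)
    {
        uint64_t out;

        *accum += (__uint128_t)limb * w;
        out = (uint64_t)*accum & ((1ull << 56) - 1);
        *accum >>= 56;
        return out;
    }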
diff --git a/crypto/ec/curve448/arch_x86_64/f_impl.h b/crypto/ec/curve448/arch_x86_64/f_impl.h
deleted file mode 100644
index b124577832e408a63d51a251d8834d66dcb0f3c2..0000000000000000000000000000000000000000
--- a/crypto/ec/curve448/arch_x86_64/f_impl.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
- * Copyright 2014-2016 Cryptography Research, Inc.
- *
- * Licensed under the OpenSSL license (the "License"). You may not use
- * this file except in compliance with the License. You can obtain a copy
- * in the file LICENSE in the source distribution or at
- * https://www.openssl.org/source/license.html
- *
- * Originally written by Mike Hamburg
- */
-
-#define GF_HEADROOM 60
-#define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
-#define LIMB_PLACE_VALUE(i) 56
-
-void gf_add_RAW(gf out, const gf a, const gf b)
-{
-    for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint64xn_t); i++) {
-        ((uint64xn_t *) out)[i] =
-            ((const uint64xn_t *)a)[i] + ((const uint64xn_t *)b)[i];
-    }
-}
-
-void gf_sub_RAW(gf out, const gf a, const gf b)
-{
-    for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint64xn_t); i++) {
-        ((uint64xn_t *) out)[i] =
-            ((const uint64xn_t *)a)[i] - ((const uint64xn_t *)b)[i];
-    }
-}
-
-void gf_bias(gf a, int amt)
-{
-    uint64_t co1 = ((1ull << 56) - 1) * amt, co2 = co1 - amt;
-
-#if __AVX2__
-    uint64x4_t lo = { co1, co1, co1, co1 }, hi = { co2, co1, co1, co1 };
-    uint64x4_t *aa = (uint64x4_t *) a;
-    aa[0] += lo;
-    aa[1] += hi;
-#elif __SSE2__
-    uint64x2_t lo = { co1, co1 }, hi = { co2, co1 };
-    uint64x2_t *aa = (uint64x2_t *) a;
-    aa[0] += lo;
-    aa[1] += lo;
-    aa[2] += hi;
-    aa[3] += lo;
-#else
-    for (unsigned int i = 0; i < sizeof(*a) / sizeof(uint64_t); i++) {
-        a->limb[i] += (i == 4) ? co2 : co1;
-    }
-#endif
-}
-
-void gf_weak_reduce(gf a)
-{
-    /* PERF: use pshufb/palignr if anyone cares about speed of this */
-    uint64_t mask = (1ull << 56) - 1;
-    uint64_t tmp = a->limb[7] >> 56;
-
-    a->limb[4] += tmp;
-    for (unsigned int i = 7; i > 0; i--) {
-        a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56);
-    }
-    a->limb[0] = (a->limb[0] & mask) + tmp;
-}
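gf_bias adds amt*p limb-wise so that a following subtraction cannot underflow; GF_HEADROOM bounds how much such slack the 64-bit limbs may accumulate above the 56-bit place value before gf_weak_reduce must be called. The constants follow from p's limb pattern: in radix 2^56, every limb of p = 2^448 - 2^224 - 1 equals 2^56 - 1 except limb 4, which equals 2^56 - 2, which is exactly the co1/co2 pair above; the same pair, scaled by 2, appears in the gf_sub_RAW of the reference f_impl.h earlier in this diff. A scalar sketch equivalent to the #else branch (bias_limbs is an illustrative name):

    /* Add amt * p limb-wise: each limb of p is 2^56 - 1,
     * except limb 4, which is 2^56 - 2. */
    static void bias_limbs(uint64_t limb[8], uint64_t amt)
    {
        uint64_t co1 = ((1ull << 56) - 1) * amt, co2 = co1 - amt;
        unsigned int i;

        for (i = 0; i < 8; i++)
            limb[i] += (i == 4) ? co2 : co1;
    }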