Commit 7e492f33 authored by Matt Caswell

Remove curve448 architecture specific files

Remove all architecture specific files except for the reference arch_32
version. These files provide architecture specific performance optimisation.
However they have not been integrated yet. In order to avoid review issues
they are removed for now. They may be reintroduced at a later time.
Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
(Merged from https://github.com/openssl/openssl/pull/5105)
Parent 0cdcdacc
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2016 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#ifndef __ARCH_ARM_32_ARCH_INTRINSICS_H__
# define __ARCH_ARM_32_ARCH_INTRINSICS_H__
# define ARCH_WORD_BITS 32
static __inline__ __attribute((always_inline, unused))
uint32_t word_is_zero(uint32_t a)
{
uint32_t ret;
asm("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
return ret;
}
static __inline__ __attribute((always_inline, unused))
uint64_t widemul(uint32_t a, uint32_t b)
{
/*
* Could be UMULL, but it's hard to express to CC that the registers must
* be different
*/
return ((uint64_t)a) * b;
}
#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
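/*
 * For reference, a minimal portable sketch (assuming <stdint.h>) of what
 * word_is_zero() above computes: SUBS borrows only when a == 0, and SBC then
 * smears that borrow across the register, giving an all-ones mask for zero
 * input and 0 otherwise.
 */
static uint32_t word_is_zero_portable(uint32_t a)
{
    /* a - 1 underflows into the upper 32 bits only when a == 0 */
    return (uint32_t)((((uint64_t)a) - 1) >> 32);
}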
This diff is collapsed.
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2014-2016 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#define GF_HEADROOM 2
#define LIMB(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
{{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}
#define LIMB_PLACE_VALUE(i) 28
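/*
 * Each FIELD_LITERAL() argument is a 56-bit constant that LIMB() splits into
 * a low and a high 28-bit limb, so the eight arguments expand to the sixteen
 * 28-bit limbs of this representation.  For an arbitrary illustrative value:
 *
 *     LIMB(0x0123456789abcd) -> 0x789abcd, 0x0123456
 */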
void gf_add_RAW(gf out, const gf a, const gf b)
{
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
((uint32xn_t *) out)[i] =
((const uint32xn_t *)a)[i] + ((const uint32xn_t *)b)[i];
}
}
void gf_sub_RAW(gf out, const gf a, const gf b)
{
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
((uint32xn_t *) out)[i] =
((const uint32xn_t *)a)[i] - ((const uint32xn_t *)b)[i];
}
}
void gf_bias(gf a, int amt)
{
uint32_t co1 = ((1ull << 28) - 1) * amt, co2 = co1 - amt;
uint32x4_t lo = { co1, co1, co1, co1 }, hi = { co2, co1, co1, co1 };
uint32x4_t *aa = (uint32x4_t *) a;
aa[0] += lo;
aa[1] += lo;
aa[2] += hi;
aa[3] += lo;
}
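/*
 * gf_bias() adds amt * p limb-wise, where p = 2^448 - 2^224 - 1.  In this
 * 16 x 28-bit representation every limb of p is 2^28 - 1 except limb 8 (the
 * 2^224 position), which is 2^28 - 2; hence co1 = amt * (2^28 - 1) for most
 * limbs and co2 = co1 - amt for limb 8, the first lane of aa[2].  A scalar
 * sketch of the same operation:
 *
 *     for (i = 0; i < 16; i++)
 *         a->limb[i] += (i == 8) ? co2 : co1;
 *
 * so that a following raw subtraction cannot underflow.
 */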
void gf_weak_reduce(gf a)
{
uint64_t mask = (1ull << 28) - 1;
uint64_t tmp = a->limb[15] >> 28;
a->limb[8] += tmp;
for (unsigned int i = 15; i > 0; i--) {
a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 28);
}
a->limb[0] = (a->limb[0] & mask) + tmp;
}
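/*
 * The carry folding above uses 2^448 = 2^224 + 1 (mod p): the bits that
 * overflow limb 15 (weight 2^448) are added back at limb 8 (weight 2^224)
 * and at limb 0, while the other limbs simply propagate their 28-bit
 * carries.
 */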
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2016 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#ifndef __ARCH_NEON_ARCH_INTRINSICS_H__
# define __ARCH_NEON_ARCH_INTRINSICS_H__
# define ARCH_WORD_BITS 32
static __inline__ __attribute((always_inline, unused))
uint32_t word_is_zero(uint32_t a)
{
uint32_t ret;
__asm__("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
return ret;
}
static __inline__ __attribute((always_inline, unused))
uint64_t widemul(uint32_t a, uint32_t b)
{
/*
* Could be UMULL, but it's hard to express to CC that the registers must
* be different
*/
return ((uint64_t)a) * b;
}
#endif /* __ARCH_NEON_ARCH_INTRINSICS_H__ */
This diff is collapsed.
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2014-2016 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#define GF_HEADROOM 2
#define LIMBPERM(x) (((x)<<1 | (x)>>3) & 15)
#define USE_NEON_PERM 1
#define LIMBHI(x) ((x##ull)>>28)
#define LIMBLO(x) ((x##ull)&((1ull<<28)-1))
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) { \
{ \
LIMBLO(a), LIMBLO(e), LIMBHI(a), LIMBHI(e), LIMBLO(b), LIMBLO(f), \
LIMBHI(b), LIMBHI(f), LIMBLO(c), LIMBLO(g), LIMBHI(c), LIMBHI(g), \
LIMBLO(d), LIMBLO(h), LIMBHI(d), LIMBHI(h) \
} \
}
#define LIMB_PLACE_VALUE(i) 28
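/*
 * The NEON layout interleaves the two halves of an element: logical limb i
 * (i = 0..7) is stored in lane 2*i and logical limb i+8 in lane 2*i+1, which
 * is what LIMBPERM() computes and what FIELD_LITERAL() lays out, e.g.
 *
 *     LIMBPERM(3)  == 6    (low half,  limb 3  -> lane 6)
 *     LIMBPERM(11) == 7    (high half, limb 11 -> lane 7)
 *
 * so each uint32x2_t pairs limb i with limb i+8.
 */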
void gf_add_RAW(gf out, const gf a, const gf b)
{
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
((uint32xn_t *) out)[i] =
((const uint32xn_t *)a)[i] + ((const uint32xn_t *)b)[i];
}
}
void gf_sub_RAW(gf out, const gf a, const gf b)
{
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
((uint32xn_t *) out)[i] =
((const uint32xn_t *)a)[i] - ((const uint32xn_t *)b)[i];
}
}
void gf_bias(gf a, int amt)
{
uint32_t co1 = ((1ull << 28) - 1) * amt, co2 = co1 - amt;
uint32x4_t lo = { co1, co2, co1, co1 };
uint32x4_t hi = { co1, co1, co1, co1 };
uint32x4_t *aa = (uint32x4_t *) a;
aa[0] += lo;
aa[1] += hi;
aa[2] += hi;
aa[3] += hi;
}
void gf_weak_reduce(gf a)
{
uint32x2_t *aa = (uint32x2_t *) a;
uint32x2_t vmask = { (1ull << 28) - 1, (1ull << 28) - 1};
uint32x2_t vm2 = { 0, -1}, tmp = vshr_n_u32(aa[7], 28);
for (unsigned int i = 7; i >= 1; i--)
aa[i] = vsra_n_u32(aa[i] & vmask, aa[i - 1], 28);
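/*
 * tmp = { carry out of limb 7, carry out of limb 15 }.  vrev64_u32(tmp)
 * swaps the lanes so limb 0 receives the 2^448 carry and limb 8 the chain
 * carry from limb 7; (tmp & vm2) adds the 2^448 carry to limb 8 as well,
 * using 2^448 = 2^224 + 1 (mod p).
 */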
aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp & vm2);
}
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2016 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#ifndef __ARCH_REF64_ARCH_INTRINSICS_H__
# define __ARCH_REF64_ARCH_INTRINSICS_H__
# define ARCH_WORD_BITS 64
static __inline__ __attribute((always_inline, unused))
uint64_t word_is_zero(uint64_t a)
{
/* let's hope the compiler isn't clever enough to optimize this. */
return (((__uint128_t) a) - 1) >> 64;
}
static __inline__ __attribute((always_inline, unused))
__uint128_t widemul(uint64_t a, uint64_t b)
{
return ((__uint128_t) a) * b;
}
#endif /* __ARCH_REF64_ARCH_INTRINSICS_H__ */
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2014 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#include "field.h"
void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
{
const uint64_t *a = as->limb, *b = bs->limb;
uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull << 56) - 1;
uint64_t aa[4], bb[4], bbb[4];
unsigned int i;
for (i = 0; i < 4; i++) {
aa[i] = a[i] + a[i + 4];
bb[i] = b[i] + b[i + 4];
bbb[i] = bb[i] + b[i + 4];
}
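/*
 * Karatsuba-style split for the Solinas prime p = 2^448 - 2^224 - 1: writing
 * x = x0 + 2^224*x1 and y = y0 + 2^224*y1 and using 2^448 = 2^224 + 1 (mod p),
 *
 *     x*y = (x0*y0 + x1*y1) + 2^224*((x0+x1)*(y0+y1) - x0*y0)   (mod p)
 *
 * aa and bb hold the half sums, and bbb = bb + b_hi is used for the partial
 * products that wrap around 2^448.
 */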
int I_HATE_UNROLLED_LOOPS = 0;
if (I_HATE_UNROLLED_LOOPS) {
/*
* The compiler probably won't unroll this, so it's like 80% slower.
*/
for (i = 0; i < 4; i++) {
accum2 = 0;
unsigned int j;
for (j = 0; j <= i; j++) {
accum2 += widemul(a[j], b[i - j]);
accum1 += widemul(aa[j], bb[i - j]);
accum0 += widemul(a[j + 4], b[i - j + 4]);
}
for (; j < 4; j++) {
accum2 += widemul(a[j], b[i - j + 8]);
accum1 += widemul(aa[j], bbb[i - j + 4]);
accum0 += widemul(a[j + 4], bb[i - j + 4]);
}
accum1 -= accum2;
accum0 += accum2;
c[i] = ((uint64_t)(accum0)) & mask;
c[i + 4] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
}
} else {
accum2 = widemul(a[0], b[0]);
accum1 += widemul(aa[0], bb[0]);
accum0 += widemul(a[4], b[4]);
accum2 += widemul(a[1], b[7]);
accum1 += widemul(aa[1], bbb[3]);
accum0 += widemul(a[5], bb[3]);
accum2 += widemul(a[2], b[6]);
accum1 += widemul(aa[2], bbb[2]);
accum0 += widemul(a[6], bb[2]);
accum2 += widemul(a[3], b[5]);
accum1 += widemul(aa[3], bbb[1]);
accum0 += widemul(a[7], bb[1]);
accum1 -= accum2;
accum0 += accum2;
c[0] = ((uint64_t)(accum0)) & mask;
c[4] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(a[0], b[1]);
accum1 += widemul(aa[0], bb[1]);
accum0 += widemul(a[4], b[5]);
accum2 += widemul(a[1], b[0]);
accum1 += widemul(aa[1], bb[0]);
accum0 += widemul(a[5], b[4]);
accum2 += widemul(a[2], b[7]);
accum1 += widemul(aa[2], bbb[3]);
accum0 += widemul(a[6], bb[3]);
accum2 += widemul(a[3], b[6]);
accum1 += widemul(aa[3], bbb[2]);
accum0 += widemul(a[7], bb[2]);
accum1 -= accum2;
accum0 += accum2;
c[1] = ((uint64_t)(accum0)) & mask;
c[5] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(a[0], b[2]);
accum1 += widemul(aa[0], bb[2]);
accum0 += widemul(a[4], b[6]);
accum2 += widemul(a[1], b[1]);
accum1 += widemul(aa[1], bb[1]);
accum0 += widemul(a[5], b[5]);
accum2 += widemul(a[2], b[0]);
accum1 += widemul(aa[2], bb[0]);
accum0 += widemul(a[6], b[4]);
accum2 += widemul(a[3], b[7]);
accum1 += widemul(aa[3], bbb[3]);
accum0 += widemul(a[7], bb[3]);
accum1 -= accum2;
accum0 += accum2;
c[2] = ((uint64_t)(accum0)) & mask;
c[6] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(a[0], b[3]);
accum1 += widemul(aa[0], bb[3]);
accum0 += widemul(a[4], b[7]);
accum2 += widemul(a[1], b[2]);
accum1 += widemul(aa[1], bb[2]);
accum0 += widemul(a[5], b[6]);
accum2 += widemul(a[2], b[1]);
accum1 += widemul(aa[2], bb[1]);
accum0 += widemul(a[6], b[5]);
accum2 += widemul(a[3], b[0]);
accum1 += widemul(aa[3], bb[0]);
accum0 += widemul(a[7], b[4]);
accum1 -= accum2;
accum0 += accum2;
c[3] = ((uint64_t)(accum0)) & mask;
c[7] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
} /* !I_HATE_UNROLLED_LOOPS */
accum0 += accum1;
accum0 += c[4];
accum1 += c[0];
c[4] = ((uint64_t)(accum0)) & mask;
c[0] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
c[5] += ((uint64_t)(accum0));
c[1] += ((uint64_t)(accum1));
}
void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
{
const uint64_t *a = as->limb;
uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum4 = 0;
uint64_t mask = (1ull << 56) - 1;
int i;
for (i = 0; i < 4; i++) {
accum0 += widemul(b, a[i]);
accum4 += widemul(b, a[i + 4]);
c[i] = accum0 & mask;
accum0 >>= 56;
c[i + 4] = accum4 & mask;
accum4 >>= 56;
}
accum0 += accum4 + c[4];
c[4] = accum0 & mask;
c[5] += accum0 >> 56;
accum4 += c[0];
c[0] = accum4 & mask;
c[1] += accum4 >> 56;
}
void gf_sqr(gf_s * __restrict__ cs, const gf as)
{
const uint64_t *a = as->limb;
uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull << 56) - 1;
uint64_t aa[4];
/* For some reason clang doesn't vectorize this without prompting? */
unsigned int i;
for (i = 0; i < 4; i++) {
aa[i] = a[i] + a[i + 4];
}
accum2 = widemul(a[0], a[3]);
accum0 = widemul(aa[0], aa[3]);
accum1 = widemul(a[4], a[7]);
accum2 += widemul(a[1], a[2]);
accum0 += widemul(aa[1], aa[2]);
accum1 += widemul(a[5], a[6]);
accum0 -= accum2;
accum1 += accum2;
c[3] = ((uint64_t)(accum1)) << 1 & mask;
c[7] = ((uint64_t)(accum0)) << 1 & mask;
accum0 >>= 55;
accum1 >>= 55;
accum0 += widemul(2 * aa[1], aa[3]);
accum1 += widemul(2 * a[5], a[7]);
accum0 += widemul(aa[2], aa[2]);
accum1 += accum0;
accum0 -= widemul(2 * a[1], a[3]);
accum1 += widemul(a[6], a[6]);
accum2 = widemul(a[0], a[0]);
accum1 -= accum2;
accum0 += accum2;
accum0 -= widemul(a[2], a[2]);
accum1 += widemul(aa[0], aa[0]);
accum0 += widemul(a[4], a[4]);
c[0] = ((uint64_t)(accum0)) & mask;
c[4] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(2 * aa[2], aa[3]);
accum0 -= widemul(2 * a[2], a[3]);
accum1 += widemul(2 * a[6], a[7]);
accum1 += accum2;
accum0 += accum2;
accum2 = widemul(2 * a[0], a[1]);
accum1 += widemul(2 * aa[0], aa[1]);
accum0 += widemul(2 * a[4], a[5]);
accum1 -= accum2;
accum0 += accum2;
c[1] = ((uint64_t)(accum0)) & mask;
c[5] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(aa[3], aa[3]);
accum0 -= widemul(a[3], a[3]);
accum1 += widemul(a[7], a[7]);
accum1 += accum2;
accum0 += accum2;
accum2 = widemul(2 * a[0], a[2]);
accum1 += widemul(2 * aa[0], aa[2]);
accum0 += widemul(2 * a[4], a[6]);
accum2 += widemul(a[1], a[1]);
accum1 += widemul(aa[1], aa[1]);
accum0 += widemul(a[5], a[5]);
accum1 -= accum2;
accum0 += accum2;
c[2] = ((uint64_t)(accum0)) & mask;
c[6] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum0 += c[3];
accum1 += c[7];
c[3] = ((uint64_t)(accum0)) & mask;
c[7] = ((uint64_t)(accum1)) & mask;
/* we could almost stop here, but it wouldn't be stable, so... */
accum0 >>= 56;
accum1 >>= 56;
c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
c[0] += ((uint64_t)(accum1));
}
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2014-2016 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#define GF_HEADROOM 9999 /* Everything is reduced anyway */
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
#define LIMB_PLACE_VALUE(i) 56
void gf_add_RAW(gf out, const gf a, const gf b)
{
for (unsigned int i = 0; i < 8; i++)
out->limb[i] = a->limb[i] + b->limb[i];
gf_weak_reduce(out);
}
void gf_sub_RAW(gf out, const gf a, const gf b)
{
uint64_t co1 = ((1ull << 56) - 1) * 2, co2 = co1 - 2;
for (unsigned int i = 0; i < 8; i++)
out->limb[i] = a->limb[i] - b->limb[i] + ((i == 4) ? co2 : co1);
gf_weak_reduce(out);
}
void gf_bias(gf a, int amt)
{
(void)a;
(void)amt;
}
void gf_weak_reduce(gf a)
{
uint64_t mask = (1ull << 56) - 1;
uint64_t tmp = a->limb[7] >> 56;
a->limb[4] += tmp;
for (unsigned int i = 7; i > 0; i--)
a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56);
a->limb[0] = (a->limb[0] & mask) + tmp;
}
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2014-2016 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__
#define __ARCH_X86_64_ARCH_INTRINSICS_H__
#define ARCH_WORD_BITS 64
#include <openssl/e_os2.h>
/* FUTURE: autogenerate */
static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b)
{
uint64_t c, d;
#ifndef __BMI2__
__asm__ volatile
("movq %[a], %%rax;"
"mulq %[b];"
: [c]"=&a"(c), [d]"=d"(d)
: [b]"m"(*b), [a]"m"(*a)
: "cc");
#else
__asm__ volatile
("movq %[a], %%rdx;"
"mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"m"(*b), [a]"m"(*a)
: "rdx");
#endif
return (((__uint128_t)(d)) << 64) | c;
}
static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b)
{
uint64_t c, d;
#ifndef __BMI2__
__asm__ volatile
("movq %[a], %%rax;"
"mulq %[b];"
: [c]"=&a"(c), [d]"=d"(d)
: [b]"m"(*b), [a]"r"(a)
: "cc");
#else
__asm__ volatile
("mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"m"(*b), [a]"d"(a));
#endif
return (((__uint128_t)(d)) << 64) | c;
}
static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b)
{
uint64_t c, d;
#ifndef __BMI2__
__asm__ volatile
("mulq %[b];"
: [c]"=a"(c), [d]"=d"(d)
: [b]"r"(b), "a"(a)
: "cc");
#else
__asm__ volatile
("mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"r"(b), [a]"d"(a));
#endif
return (((__uint128_t)(d)) << 64) | c;
}
static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b)
{
uint64_t c, d;
#ifndef __BMI2__
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; "
"mulq %[b];"
: [c]"=&a"(c), [d]"=d"(d)
: [b]"m"(*b), [a]"m"(*a)
: "cc");
#else
__asm__ volatile
("movq %[a], %%rdx;"
"leaq (,%%rdx,2), %%rdx;"
"mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"m"(*b), [a]"m"(*a)
: "rdx");
#endif
return (((__uint128_t)(d)) << 64) | c;
}
static __inline__ void mac(__uint128_t *acc, const uint64_t *a,
const uint64_t *b)
{
uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=&r"(c), [d]"=&r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi)) << 64) | lo;
}
static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2,
const uint64_t *a, const uint64_t *b)
{
uint64_t lo = *acc, hi = *acc >> 64;
uint64_t lo2 = *acc2, hi2 = *acc2 >> 64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
"addq %[c], %[lo2]; "
"adcq %[d], %[hi2]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
"addq %%rax, %[lo2]; "
"adcq %%rdx, %[hi2]; "
: [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi)) << 64) | lo;
*acc2 = (((__uint128_t)(hi2)) << 64) | lo2;
}
static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b)
{
uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"d"(a)
: "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"r"(a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi)) << 64) | lo;
}
static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b)
{
uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"r"(b), [a]"d"(a)
: "cc");
#else
__asm__ volatile
("mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi), "+a"(a)
: [b]"r"(b)
: "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi)) << 64) | lo;
}
static __inline__ void mac2(__uint128_t *acc, const uint64_t *a,
const uint64_t *b)
{
uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"addq %%rdx, %%rdx; "
"mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi)) << 64) | lo;
}
static __inline__ void msb(__uint128_t *acc, const uint64_t *a,
const uint64_t *b)
{
uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"subq %[c], %[lo]; "
"sbbq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"subq %%rax, %[lo]; "
"sbbq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi)) << 64) | lo;
}
static __inline__ void msb2(__uint128_t *acc, const uint64_t *a,
const uint64_t *b)
{
uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"addq %%rdx, %%rdx; "
"mulx %[b], %[c], %[d]; "
"subq %[c], %[lo]; "
"sbbq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; "
"mulq %[b]; "
"subq %%rax, %[lo]; "
"sbbq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}
static __inline__ void mrs(__uint128_t *acc, const uint64_t *a,
const uint64_t *b)
{
uint64_t c,d, lo = *acc, hi = *acc >> 64;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"subq %[lo], %[c]; "
"sbbq %[hi], %[d]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
*acc = (((__uint128_t)(d)) << 64) | c;
}
static __inline__ uint64_t word_is_zero(uint64_t x)
{
__asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
return ~x;
}
static inline uint64_t shrld(__uint128_t x, int n)
{
return x >> n;
}
#endif /* __ARCH_X86_64_ARCH_INTRINSICS_H__ */
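/*
 * For reference, portable sketches (assuming a compiler with __uint128_t) of
 * what the widemul()/mac()/msb() helpers above compute; the inline assembly
 * above implements the same arithmetic with explicit MUL/MULX/ADD/ADC
 * sequences.
 */
static __inline__ __uint128_t widemul_portable(const uint64_t *a,
                                               const uint64_t *b)
{
    return ((__uint128_t)*a) * *b;                /* full 128-bit product */
}

static __inline__ void mac_portable(__uint128_t *acc, const uint64_t *a,
                                    const uint64_t *b)
{
    *acc += ((__uint128_t)*a) * *b;               /* multiply-accumulate */
}

static __inline__ void msb_portable(__uint128_t *acc, const uint64_t *a,
                                    const uint64_t *b)
{
    *acc -= ((__uint128_t)*a) * *b;               /* multiply-subtract */
}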
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2014 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#include "field.h"
void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
{
const uint64_t *a = as->limb, *b = bs->limb;
uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull << 56) - 1;
uint64_t aa[4] VECTOR_ALIGNED, bb[4] VECTOR_ALIGNED, bbb[4] VECTOR_ALIGNED;
/* For some reason clang doesn't vectorize this without prompting? */
unsigned int i;
for (i = 0; i < sizeof(aa) / sizeof(uint64xn_t); i++) {
((uint64xn_t *) aa)[i] =
((const uint64xn_t *)a)[i] + ((const uint64xn_t *)(&a[4]))[i];
((uint64xn_t *) bb)[i] =
((const uint64xn_t *)b)[i] + ((const uint64xn_t *)(&b[4]))[i];
((uint64xn_t *) bbb)[i] =
((const uint64xn_t *)bb)[i] + ((const uint64xn_t *)(&b[4]))[i];
}
/*
* for (int i=0; i<4; i++) { aa[i] = a[i] + a[i+4]; bb[i] = b[i] + b[i+4];
* }
*/
accum2 = widemul(&a[0], &b[3]);
accum0 = widemul(&aa[0], &bb[3]);
accum1 = widemul(&a[4], &b[7]);
mac(&accum2, &a[1], &b[2]);
mac(&accum0, &aa[1], &bb[2]);
mac(&accum1, &a[5], &b[6]);
mac(&accum2, &a[2], &b[1]);
mac(&accum0, &aa[2], &bb[1]);
mac(&accum1, &a[6], &b[5]);
mac(&accum2, &a[3], &b[0]);
mac(&accum0, &aa[3], &bb[0]);
mac(&accum1, &a[7], &b[4]);
accum0 -= accum2;
accum1 += accum2;
c[3] = ((uint64_t)(accum1)) & mask;
c[7] = ((uint64_t)(accum0)) & mask;
accum0 >>= 56;
accum1 >>= 56;
mac(&accum0, &aa[1], &bb[3]);
mac(&accum1, &a[5], &b[7]);
mac(&accum0, &aa[2], &bb[2]);
mac(&accum1, &a[6], &b[6]);
mac(&accum0, &aa[3], &bb[1]);
accum1 += accum0;
accum2 = widemul(&a[0], &b[0]);
accum1 -= accum2;
accum0 += accum2;
msb(&accum0, &a[1], &b[3]);
msb(&accum0, &a[2], &b[2]);
mac(&accum1, &a[7], &b[5]);
msb(&accum0, &a[3], &b[1]);
mac(&accum1, &aa[0], &bb[0]);
mac(&accum0, &a[4], &b[4]);
c[0] = ((uint64_t)(accum0)) & mask;
c[4] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(&a[2], &b[7]);
mac(&accum0, &a[6], &bb[3]);
mac(&accum1, &aa[2], &bbb[3]);
mac(&accum2, &a[3], &b[6]);
mac(&accum0, &a[7], &bb[2]);
mac(&accum1, &aa[3], &bbb[2]);
mac(&accum2, &a[0], &b[1]);
mac(&accum1, &aa[0], &bb[1]);
mac(&accum0, &a[4], &b[5]);
mac(&accum2, &a[1], &b[0]);
mac(&accum1, &aa[1], &bb[0]);
mac(&accum0, &a[5], &b[4]);
accum1 -= accum2;
accum0 += accum2;
c[1] = ((uint64_t)(accum0)) & mask;
c[5] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(&a[3], &b[7]);
mac(&accum0, &a[7], &bb[3]);
mac(&accum1, &aa[3], &bbb[3]);
mac(&accum2, &a[0], &b[2]);
mac(&accum1, &aa[0], &bb[2]);
mac(&accum0, &a[4], &b[6]);
mac(&accum2, &a[1], &b[1]);
mac(&accum1, &aa[1], &bb[1]);
mac(&accum0, &a[5], &b[5]);
mac(&accum2, &a[2], &b[0]);
mac(&accum1, &aa[2], &bb[0]);
mac(&accum0, &a[6], &b[4]);
accum1 -= accum2;
accum0 += accum2;
c[2] = ((uint64_t)(accum0)) & mask;
c[6] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum0 += c[3];
accum1 += c[7];
c[3] = ((uint64_t)(accum0)) & mask;
c[7] = ((uint64_t)(accum1)) & mask;
/* we could almost stop here, but it wouldn't be stable, so... */
accum0 >>= 56;
accum1 >>= 56;
c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
c[0] += ((uint64_t)(accum1));
}
void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
{
const uint64_t *a = as->limb;
uint64_t *c = cs->limb;
__uint128_t accum0, accum4;
uint64_t mask = (1ull << 56) - 1;
accum0 = widemul_rm(b, &a[0]);
accum4 = widemul_rm(b, &a[4]);
c[0] = accum0 & mask;
accum0 >>= 56;
c[4] = accum4 & mask;
accum4 >>= 56;
mac_rm(&accum0, b, &a[1]);
mac_rm(&accum4, b, &a[5]);
c[1] = accum0 & mask;
accum0 >>= 56;
c[5] = accum4 & mask;
accum4 >>= 56;
mac_rm(&accum0, b, &a[2]);
mac_rm(&accum4, b, &a[6]);
c[2] = accum0 & mask;
accum0 >>= 56;
c[6] = accum4 & mask;
accum4 >>= 56;
mac_rm(&accum0, b, &a[3]);
mac_rm(&accum4, b, &a[7]);
c[3] = accum0 & mask;
accum0 >>= 56;
c[7] = accum4 & mask;
accum4 >>= 56;
accum0 += accum4 + c[4];
c[4] = accum0 & mask;
c[5] += accum0 >> 56;
accum4 += c[0];
c[0] = accum4 & mask;
c[1] += accum4 >> 56;
}
void gf_sqr(gf_s * __restrict__ cs, const gf as)
{
const uint64_t *a = as->limb;
uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull << 56) - 1;
uint64_t aa[4] VECTOR_ALIGNED;
/* For some reason clang doesn't vectorize this without prompting? */
unsigned int i;
for (i = 0; i < sizeof(aa) / sizeof(uint64xn_t); i++) {
((uint64xn_t *) aa)[i] =
((const uint64xn_t *)a)[i] + ((const uint64xn_t *)(&a[4]))[i];
}
accum2 = widemul(&a[0], &a[3]);
accum0 = widemul(&aa[0], &aa[3]);
accum1 = widemul(&a[4], &a[7]);
mac(&accum2, &a[1], &a[2]);
mac(&accum0, &aa[1], &aa[2]);
mac(&accum1, &a[5], &a[6]);
accum0 -= accum2;
accum1 += accum2;
c[3] = ((uint64_t)(accum1)) << 1 & mask;
c[7] = ((uint64_t)(accum0)) << 1 & mask;
accum0 >>= 55;
accum1 >>= 55;
mac2(&accum0, &aa[1], &aa[3]);
mac2(&accum1, &a[5], &a[7]);
mac(&accum0, &aa[2], &aa[2]);
accum1 += accum0;
msb2(&accum0, &a[1], &a[3]);
mac(&accum1, &a[6], &a[6]);
accum2 = widemul(&a[0], &a[0]);
accum1 -= accum2;
accum0 += accum2;
msb(&accum0, &a[2], &a[2]);
mac(&accum1, &aa[0], &aa[0]);
mac(&accum0, &a[4], &a[4]);
c[0] = ((uint64_t)(accum0)) & mask;
c[4] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul2(&aa[2], &aa[3]);
msb2(&accum0, &a[2], &a[3]);
mac2(&accum1, &a[6], &a[7]);
accum1 += accum2;
accum0 += accum2;
accum2 = widemul2(&a[0], &a[1]);
mac2(&accum1, &aa[0], &aa[1]);
mac2(&accum0, &a[4], &a[5]);
accum1 -= accum2;
accum0 += accum2;
c[1] = ((uint64_t)(accum0)) & mask;
c[5] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(&aa[3], &aa[3]);
msb(&accum0, &a[3], &a[3]);
mac(&accum1, &a[7], &a[7]);
accum1 += accum2;
accum0 += accum2;
accum2 = widemul2(&a[0], &a[2]);
mac2(&accum1, &aa[0], &aa[2]);
mac2(&accum0, &a[4], &a[6]);
mac(&accum2, &a[1], &a[1]);
mac(&accum1, &aa[1], &aa[1]);
mac(&accum0, &a[5], &a[5]);
accum1 -= accum2;
accum0 += accum2;
c[2] = ((uint64_t)(accum0)) & mask;
c[6] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum0 += c[3];
accum1 += c[7];
c[3] = ((uint64_t)(accum0)) & mask;
c[7] = ((uint64_t)(accum1)) & mask;
/* we could almost stop here, but it wouldn't be stable, so... */
accum0 >>= 56;
accum1 >>= 56;
c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
c[0] += ((uint64_t)(accum1));
}
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2014-2016 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#define GF_HEADROOM 60
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
#define LIMB_PLACE_VALUE(i) 56
void gf_add_RAW(gf out, const gf a, const gf b)
{
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint64xn_t); i++) {
((uint64xn_t *) out)[i] =
((const uint64xn_t *)a)[i] + ((const uint64xn_t *)b)[i];
}
}
void gf_sub_RAW(gf out, const gf a, const gf b)
{
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint64xn_t); i++) {
((uint64xn_t *) out)[i] =
((const uint64xn_t *)a)[i] - ((const uint64xn_t *)b)[i];
}
}
void gf_bias(gf a, int amt)
{
uint64_t co1 = ((1ull << 56) - 1) * amt, co2 = co1 - amt;
#if __AVX2__
uint64x4_t lo = { co1, co1, co1, co1 }, hi = { co2, co1, co1, co1 };
uint64x4_t *aa = (uint64x4_t *) a;
aa[0] += lo;
aa[1] += hi;
#elif __SSE2__
uint64x2_t lo = { co1, co1 }, hi = { co2, co1 };
uint64x2_t *aa = (uint64x2_t *) a;
aa[0] += lo;
aa[1] += lo;
aa[2] += hi;
aa[3] += lo;
#else
for (unsigned int i = 0; i < sizeof(*a) / sizeof(uint64_t); i++) {
a->limb[i] += (i == 4) ? co2 : co1;
}
#endif
}
void gf_weak_reduce(gf a)
{
/* PERF: use pshufb/palignr if anyone cares about speed of this */
uint64_t mask = (1ull << 56) - 1;
uint64_t tmp = a->limb[7] >> 56;
a->limb[4] += tmp;
for (unsigned int i = 7; i > 0; i--) {
a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56);
}
a->limb[0] = (a->limb[0] & mask) + tmp;
}