Commit 7e492f33 authored by Matt Caswell

Remove curve448 architecture specific files

Remove all architecture specific files except for the reference arch_32
version. These files provide architecture specific performance optimisation.
However they have not been integrated yet. In order to avoid review issues
they are removed for now. They may be reintroduced at a later time.
Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
(Merged from https://github.com/openssl/openssl/pull/5105)
Parent 0cdcdacc
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2016 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#ifndef __ARCH_ARM_32_ARCH_INTRINSICS_H__
# define __ARCH_ARM_32_ARCH_INTRINSICS_H__
# define ARCH_WORD_BITS 32
static __inline__ __attribute((always_inline, unused))
uint32_t word_is_zero(uint32_t a)
{
uint32_t ret;
asm("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
return ret;
}
static __inline__ __attribute((always_inline, unused))
uint64_t widemul(uint32_t a, uint32_t b)
{
/*
* Could be UMULL, but it's hard to express to CC that the registers must
* be different
*/
return ((uint64_t)a) * b;
}
#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
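/*
 * For reference, a minimal portable sketch (assuming <stdint.h>) of what
 * word_is_zero() above computes: SUBS borrows only when a == 0, and SBC then
 * smears that borrow across the register, giving an all-ones mask for zero
 * input and 0 otherwise.
 */
static uint32_t word_is_zero_portable(uint32_t a)
{
    /* a - 1 underflows into the upper 32 bits only when a == 0 */
    return (uint32_t)((((uint64_t)a) - 1) >> 32);
}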
This diff is collapsed.
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2014-2016 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#define GF_HEADROOM 2
#define LIMB(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
{{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}
#define LIMB_PLACE_VALUE(i) 28
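/*
 * Each FIELD_LITERAL() argument is a 56-bit constant that LIMB() splits into
 * a low and a high 28-bit limb, so the eight arguments expand to the sixteen
 * 28-bit limbs of this representation.  For an arbitrary illustrative value:
 *
 *     LIMB(0x0123456789abcd) -> 0x789abcd, 0x0123456
 */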
void gf_add_RAW(gf out, const gf a, const gf b)
{
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
((uint32xn_t *) out)[i] =
((const uint32xn_t *)a)[i] + ((const uint32xn_t *)b)[i];
}
}
void gf_sub_RAW(gf out, const gf a, const gf b)
{
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
((uint32xn_t *) out)[i] =
((const uint32xn_t *)a)[i] - ((const uint32xn_t *)b)[i];
}
}
void gf_bias(gf a, int amt)
{
uint32_t co1 = ((1ull << 28) - 1) * amt, co2 = co1 - amt;
uint32x4_t lo = { co1, co1, co1, co1 }, hi = { co2, co1, co1, co1 };
uint32x4_t *aa = (uint32x4_t *) a;
aa[0] += lo;
aa[1] += lo;
aa[2] += hi;
aa[3] += lo;
}
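/*
 * gf_bias() adds amt * p limb-wise, where p = 2^448 - 2^224 - 1.  In this
 * 16 x 28-bit representation every limb of p is 2^28 - 1 except limb 8 (the
 * 2^224 position), which is 2^28 - 2; hence co1 = amt * (2^28 - 1) for most
 * limbs and co2 = co1 - amt for limb 8, the first lane of aa[2].  A scalar
 * sketch of the same operation:
 *
 *     for (i = 0; i < 16; i++)
 *         a->limb[i] += (i == 8) ? co2 : co1;
 *
 * so that a following raw subtraction cannot underflow.
 */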
void gf_weak_reduce(gf a)
{
uint64_t mask = (1ull << 28) - 1;
uint64_t tmp = a->limb[15] >> 28;
a->limb[8] += tmp;
for (unsigned int i = 15; i > 0; i--) {
a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 28);
}
a->limb[0] = (a->limb[0] & mask) + tmp;
}
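/*
 * The carry folding above uses 2^448 = 2^224 + 1 (mod p): the bits that
 * overflow limb 15 (weight 2^448) are added back at limb 8 (weight 2^224)
 * and at limb 0, while the other limbs simply propagate their 28-bit
 * carries.
 */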
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2016 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#ifndef __ARCH_NEON_ARCH_INTRINSICS_H__
# define __ARCH_NEON_ARCH_INTRINSICS_H__
# define ARCH_WORD_BITS 32
static __inline__ __attribute((always_inline, unused))
uint32_t word_is_zero(uint32_t a)
{
uint32_t ret;
__asm__("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
return ret;
}
static __inline__ __attribute((always_inline, unused))
uint64_t widemul(uint32_t a, uint32_t b)
{
/*
* Could be UMULL, but it's hard to express to CC that the registers must
* be different
*/
return ((uint64_t)a) * b;
}
#endif /* __ARCH_NEON_ARCH_INTRINSICS_H__ */
This diff is collapsed.
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2014-2016 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#define GF_HEADROOM 2
#define LIMBPERM(x) (((x)<<1 | (x)>>3) & 15)
#define USE_NEON_PERM 1
#define LIMBHI(x) ((x##ull)>>28)
#define LIMBLO(x) ((x##ull)&((1ull<<28)-1))
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) { \
{ \
LIMBLO(a), LIMBLO(e), LIMBHI(a), LIMBHI(e), LIMBLO(b), LIMBLO(f), \
LIMBHI(b), LIMBHI(f), LIMBLO(c), LIMBLO(g), LIMBHI(c), LIMBHI(g), \
LIMBLO(d), LIMBLO(h), LIMBHI(d), LIMBHI(h) \
} \
}
#define LIMB_PLACE_VALUE(i) 28
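/*
 * The NEON layout interleaves the two halves of an element: logical limb i
 * (i = 0..7) is stored in lane 2*i and logical limb i+8 in lane 2*i+1, which
 * is what LIMBPERM() computes and what FIELD_LITERAL() lays out, e.g.
 *
 *     LIMBPERM(3)  == 6    (low half,  limb 3  -> lane 6)
 *     LIMBPERM(11) == 7    (high half, limb 11 -> lane 7)
 *
 * so each uint32x2_t pairs limb i with limb i+8.
 */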
void gf_add_RAW(gf out, const gf a, const gf b)
{
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
((uint32xn_t *) out)[i] =
((const uint32xn_t *)a)[i] + ((const uint32xn_t *)b)[i];
}
}
void gf_sub_RAW(gf out, const gf a, const gf b)
{
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
((uint32xn_t *) out)[i] =
((const uint32xn_t *)a)[i] - ((const uint32xn_t *)b)[i];
}
}
void gf_bias(gf a, int amt)
{
uint32_t co1 = ((1ull << 28) - 1) * amt, co2 = co1 - amt;
uint32x4_t lo = { co1, co2, co1, co1 };
uint32x4_t hi = { co1, co1, co1, co1 };
uint32x4_t *aa = (uint32x4_t *) a;
aa[0] += lo;
aa[1] += hi;
aa[2] += hi;
aa[3] += hi;
}
void gf_weak_reduce(gf a)
{
uint32x2_t *aa = (uint32x2_t *) a;
uint32x2_t vmask = { (1ull << 28) - 1, (1ull << 28) - 1};
uint32x2_t vm2 = { 0, -1}, tmp = vshr_n_u32(aa[7], 28);
for (unsigned int i = 7; i >= 1; i--)
aa[i] = vsra_n_u32(aa[i] & vmask, aa[i - 1], 28);
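/*
 * tmp = { carry out of limb 7, carry out of limb 15 }.  vrev64_u32(tmp)
 * swaps the lanes so limb 0 receives the 2^448 carry and limb 8 the chain
 * carry from limb 7; (tmp & vm2) adds the 2^448 carry to limb 8 as well,
 * using 2^448 = 2^224 + 1 (mod p).
 */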
aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp & vm2);
}
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2016 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#ifndef __ARCH_REF64_ARCH_INTRINSICS_H__
# define __ARCH_REF64_ARCH_INTRINSICS_H__
# define ARCH_WORD_BITS 64
static __inline__ __attribute((always_inline, unused))
uint64_t word_is_zero(uint64_t a)
{
/* let's hope the compiler isn't clever enough to optimize this. */
return (((__uint128_t) a) - 1) >> 64;
}
static __inline__ __attribute((always_inline, unused))
__uint128_t widemul(uint64_t a, uint64_t b)
{
return ((__uint128_t) a) * b;
}
#endif /* __ARCH_REF64_ARCH_INTRINSICS_H__ */
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2014 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#include "field.h"
void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
{
const uint64_t *a = as->limb, *b = bs->limb;
uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull << 56) - 1;
uint64_t aa[4], bb[4], bbb[4];
unsigned int i;
for (i = 0; i < 4; i++) {
aa[i] = a[i] + a[i + 4];
bb[i] = b[i] + b[i + 4];
bbb[i] = bb[i] + b[i + 4];
}
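/*
 * Karatsuba-style split for the Solinas prime p = 2^448 - 2^224 - 1: writing
 * x = x0 + 2^224*x1 and y = y0 + 2^224*y1 and using 2^448 = 2^224 + 1 (mod p),
 *
 *     x*y = (x0*y0 + x1*y1) + 2^224*((x0+x1)*(y0+y1) - x0*y0)   (mod p)
 *
 * aa and bb hold the half sums, and bbb = bb + b_hi is used for the partial
 * products that wrap around 2^448.
 */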
int I_HATE_UNROLLED_LOOPS = 0;
if (I_HATE_UNROLLED_LOOPS) {
/*
* The compiler probably won't unroll this, so it's like 80% slower.
*/
for (i = 0; i < 4; i++) {
accum2 = 0;
unsigned int j;
for (j = 0; j <= i; j++) {
accum2 += widemul(a[j], b[i - j]);
accum1 += widemul(aa[j], bb[i - j]);
accum0 += widemul(a[j + 4], b[i - j + 4]);
}
for (; j < 4; j++) {
accum2 += widemul(a[j], b[i - j + 8]);
accum1 += widemul(aa[j], bbb[i - j + 4]);
accum0 += widemul(a[j + 4], bb[i - j + 4]);
}
accum1 -= accum2;
accum0 += accum2;
c[i] = ((uint64_t)(accum0)) & mask;
c[i + 4] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
}
} else {
accum2 = widemul(a[0], b[0]);
accum1 += widemul(aa[0], bb[0]);
accum0 += widemul(a[4], b[4]);
accum2 += widemul(a[1], b[7]);
accum1 += widemul(aa[1], bbb[3]);
accum0 += widemul(a[5], bb[3]);
accum2 += widemul(a[2], b[6]);
accum1 += widemul(aa[2], bbb[2]);
accum0 += widemul(a[6], bb[2]);
accum2 += widemul(a[3], b[5]);
accum1 += widemul(aa[3], bbb[1]);
accum0 += widemul(a[7], bb[1]);
accum1 -= accum2;
accum0 += accum2;
c[0] = ((uint64_t)(accum0)) & mask;
c[4] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(a[0], b[1]);
accum1 += widemul(aa[0], bb[1]);
accum0 += widemul(a[4], b[5]);
accum2 += widemul(a[1], b[0]);
accum1 += widemul(aa[1], bb[0]);
accum0 += widemul(a[5], b[4]);
accum2 += widemul(a[2], b[7]);
accum1 += widemul(aa[2], bbb[3]);
accum0 += widemul(a[6], bb[3]);
accum2 += widemul(a[3], b[6]);
accum1 += widemul(aa[3], bbb[2]);
accum0 += widemul(a[7], bb[2]);
accum1 -= accum2;
accum0 += accum2;
c[1] = ((uint64_t)(accum0)) & mask;
c[5] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(a[0], b[2]);
accum1 += widemul(aa[0], bb[2]);
accum0 += widemul(a[4], b[6]);
accum2 += widemul(a[1], b[1]);
accum1 += widemul(aa[1], bb[1]);
accum0 += widemul(a[5], b[5]);
accum2 += widemul(a[2], b[0]);
accum1 += widemul(aa[2], bb[0]);
accum0 += widemul(a[6], b[4]);
accum2 += widemul(a[3], b[7]);
accum1 += widemul(aa[3], bbb[3]);
accum0 += widemul(a[7], bb[3]);
accum1 -= accum2;
accum0 += accum2;
c[2] = ((uint64_t)(accum0)) & mask;
c[6] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(a[0], b[3]);
accum1 += widemul(aa[0], bb[3]);
accum0 += widemul(a[4], b[7]);
accum2 += widemul(a[1], b[2]);
accum1 += widemul(aa[1], bb[2]);
accum0 += widemul(a[5], b[6]);
accum2 += widemul(a[2], b[1]);
accum1 += widemul(aa[2], bb[1]);
accum0 += widemul(a[6], b[5]);
accum2 += widemul(a[3], b[0]);
accum1 += widemul(aa[3], bb[0]);
accum0 += widemul(a[7], b[4]);
accum1 -= accum2;
accum0 += accum2;
c[3] = ((uint64_t)(accum0)) & mask;
c[7] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
} /* !I_HATE_UNROLLED_LOOPS */
accum0 += accum1;
accum0 += c[4];
accum1 += c[0];
c[4] = ((uint64_t)(accum0)) & mask;
c[0] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
c[5] += ((uint64_t)(accum0));
c[1] += ((uint64_t)(accum1));
}
void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
{
const uint64_t *a = as->limb;
uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum4 = 0;
uint64_t mask = (1ull << 56) - 1;
int i;
for (i = 0; i < 4; i++) {
accum0 += widemul(b, a[i]);
accum4 += widemul(b, a[i + 4]);
c[i] = accum0 & mask;
accum0 >>= 56;
c[i + 4] = accum4 & mask;
accum4 >>= 56;
}
accum0 += accum4 + c[4];
c[4] = accum0 & mask;
c[5] += accum0 >> 56;
accum4 += c[0];
c[0] = accum4 & mask;
c[1] += accum4 >> 56;
}
void gf_sqr(gf_s * __restrict__ cs, const gf as)
{
const uint64_t *a = as->limb;
uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull << 56) - 1;
uint64_t aa[4];
/* For some reason clang doesn't vectorize this without prompting? */
unsigned int i;
for (i = 0; i < 4; i++) {
aa[i] = a[i] + a[i + 4];
}
accum2 = widemul(a[0], a[3]);
accum0 = widemul(aa[0], aa[3]);
accum1 = widemul(a[4], a[7]);
accum2 += widemul(a[1], a[2]);
accum0 += widemul(aa[1], aa[2]);
accum1 += widemul(a[5], a[6]);
accum0 -= accum2;
accum1 += accum2;
c[3] = ((uint64_t)(accum1)) << 1 & mask;
c[7] = ((uint64_t)(accum0)) << 1 & mask;
accum0 >>= 55;
accum1 >>= 55;
accum0 += widemul(2 * aa[1], aa[3]);
accum1 += widemul(2 * a[5], a[7]);
accum0 += widemul(aa[2], aa[2]);
accum1 += accum0;
accum0 -= widemul(2 * a[1], a[3]);
accum1 += widemul(a[6], a[6]);
accum2 = widemul(a[0], a[0]);
accum1 -= accum2;
accum0 += accum2;
accum0 -= widemul(a[2], a[2]);
accum1 += widemul(aa[0], aa[0]);
accum0 += widemul(a[4], a[4]);
c[0] = ((uint64_t)(accum0)) & mask;
c[4] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(2 * aa[2], aa[3]);
accum0 -= widemul(2 * a[2], a[3]);
accum1 += widemul(2 * a[6], a[7]);
accum1 += accum2;
accum0 += accum2;
accum2 = widemul(2 * a[0], a[1]);
accum1 += widemul(2 * aa[0], aa[1]);
accum0 += widemul(2 * a[4], a[5]);
accum1 -= accum2;
accum0 += accum2;
c[1] = ((uint64_t)(accum0)) & mask;
c[5] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(aa[3], aa[3]);
accum0 -= widemul(a[3], a[3]);
accum1 += widemul(a[7], a[7]);
accum1 += accum2;
accum0 += accum2;
accum2 = widemul(2 * a[0], a[2]);
accum1 += widemul(2 * aa[0], aa[2]);
accum0 += widemul(2 * a[4], a[6]);
accum2 += widemul(a[1], a[1]);
accum1 += widemul(aa[1], aa[1]);
accum0 += widemul(a[5], a[5]);
accum1 -= accum2;
accum0 += accum2;
c[2] = ((uint64_t)(accum0)) & mask;
c[6] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum0 += c[3];
accum1 += c[7];
c[3] = ((uint64_t)(accum0)) & mask;
c[7] = ((uint64_t)(accum1)) & mask;
/* we could almost stop here, but it wouldn't be stable, so... */
accum0 >>= 56;
accum1 >>= 56;
c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
c[0] += ((uint64_t)(accum1));
}
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2014-2016 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#define GF_HEADROOM 9999 /* Everything is reduced anyway */
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
#define LIMB_PLACE_VALUE(i) 56
void gf_add_RAW(gf out, const gf a, const gf b)
{
for (unsigned int i = 0; i < 8; i++)
out->limb[i] = a->limb[i] + b->limb[i];
gf_weak_reduce(out);
}
void gf_sub_RAW(gf out, const gf a, const gf b)
{
uint64_t co1 = ((1ull << 56) - 1) * 2, co2 = co1 - 2;
for (unsigned int i = 0; i < 8; i++)
out->limb[i] = a->limb[i] - b->limb[i] + ((i == 4) ? co2 : co1);
gf_weak_reduce(out);
}
void gf_bias(gf a, int amt)
{
(void)a;
(void)amt;
}
void gf_weak_reduce(gf a)
{
uint64_t mask = (1ull << 56) - 1;
uint64_t tmp = a->limb[7] >> 56;
a->limb[4] += tmp;
for (unsigned int i = 7; i > 0; i--)
a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56);
a->limb[0] = (a->limb[0] & mask) + tmp;
}
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2014-2016 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__
#define __ARCH_X86_64_ARCH_INTRINSICS_H__
#define ARCH_WORD_BITS 64
#include <openssl/e_os2.h>
/* FUTURE: autogenerate */
static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b)
{
uint64_t c, d;
#ifndef __BMI2__
__asm__ volatile
("movq %[a], %%rax;"
"mulq %[b];"
: [c]"=&a"(c), [d]"=d"(d)
: [b]"m"(*b), [a]"m"(*a)
: "cc");
#else
__asm__ volatile
("movq %[a], %%rdx;"
"mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"m"(*b), [a]"m"(*a)
: "rdx");
#endif
return (((__uint128_t)(d)) << 64) | c;
}
static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b)
{
uint64_t c, d;
#ifndef __BMI2__
__asm__ volatile
("movq %[a], %%rax;"
"mulq %[b];"
: [c]"=&a"(c), [d]"=d"(d)
: [b]"m"(*b), [a]"r"(a)
: "cc");
#else
__asm__ volatile
("mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"m"(*b), [a]"d"(a));
#endif
return (((__uint128_t)(d)) << 64) | c;
}
static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b)
{
uint64_t c, d;
#ifndef __BMI2__
__asm__ volatile
("mulq %[b];"
: [c]"=a"(c), [d]"=d"(d)
: [b]"r"(b), "a"(a)
: "cc");
#else
__asm__ volatile
("mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"r"(b), [a]"d"(a));
#endif
return (((__uint128_t)(d)) << 64) | c;
}
static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b)
{
uint64_t c, d;
#ifndef __BMI2__
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; "
"mulq %[b];"
: [c]"=&a"(c), [d]"=d"(d)
: [b]"m"(*b), [a]"m"(*a)
: "cc");
#else
__asm__ volatile
("movq %[a], %%rdx;"
"leaq (,%%rdx,2), %%rdx;"
"mulx %[b], %[c], %[d];"
: [c]"=r"(c), [d]"=r"(d)
: [b]"m"(*b), [a]"m"(*a)
: "rdx");
#endif
return (((__uint128_t)(d)) << 64) | c;
}
static __inline__ void mac(__uint128_t *acc, const uint64_t *a,
const uint64_t *b)
{
uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=&r"(c), [d]"=&r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi)) << 64) | lo;
}
static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2,
const uint64_t *a, const uint64_t *b)
{
uint64_t lo = *acc, hi = *acc >> 64;
uint64_t lo2 = *acc2, hi2 = *acc2 >> 64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
"addq %[c], %[lo2]; "
"adcq %[d], %[hi2]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
"addq %%rax, %[lo2]; "
"adcq %%rdx, %[hi2]; "
: [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi)) << 64) | lo;
*acc2 = (((__uint128_t)(hi2)) << 64) | lo2;
}
static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b)
{
uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"d"(a)
: "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"r"(a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi)) << 64) | lo;
}
static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b)
{
uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"r"(b), [a]"d"(a)
: "cc");
#else
__asm__ volatile
("mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi), "+a"(a)
: [b]"r"(b)
: "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi)) << 64) | lo;
}
static __inline__ void mac2(__uint128_t *acc, const uint64_t *a,
const uint64_t *b)
{
uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"addq %%rdx, %%rdx; "
"mulx %[b], %[c], %[d]; "
"addq %[c], %[lo]; "
"adcq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; "
"mulq %[b]; "
"addq %%rax, %[lo]; "
"adcq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi)) << 64) | lo;
}
static __inline__ void msb(__uint128_t *acc, const uint64_t *a,
const uint64_t *b)
{
uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"subq %[c], %[lo]; "
"sbbq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"mulq %[b]; "
"subq %%rax, %[lo]; "
"sbbq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi)) << 64) | lo;
}
static __inline__ void msb2(__uint128_t *acc, const uint64_t *a,
const uint64_t *b)
{
uint64_t lo = *acc, hi = *acc >> 64;
#ifdef __BMI2__
uint64_t c,d;
__asm__ volatile
("movq %[a], %%rdx; "
"addq %%rdx, %%rdx; "
"mulx %[b], %[c], %[d]; "
"subq %[c], %[lo]; "
"sbbq %[d], %[hi]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
#else
__asm__ volatile
("movq %[a], %%rax; "
"addq %%rax, %%rax; "
"mulq %[b]; "
"subq %%rax, %[lo]; "
"sbbq %%rdx, %[hi]; "
: [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rax", "rdx", "cc");
#endif
*acc = (((__uint128_t)(hi))<<64) | lo;
}
static __inline__ void mrs(__uint128_t *acc, const uint64_t *a,
const uint64_t *b)
{
uint64_t c,d, lo = *acc, hi = *acc >> 64;
__asm__ volatile
("movq %[a], %%rdx; "
"mulx %[b], %[c], %[d]; "
"subq %[lo], %[c]; "
"sbbq %[hi], %[d]; "
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
: [b]"m"(*b), [a]"m"(*a)
: "rdx", "cc");
*acc = (((__uint128_t)(d)) << 64) | c;
}
static __inline__ uint64_t word_is_zero(uint64_t x)
{
__asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
return ~x;
}
static inline uint64_t shrld(__uint128_t x, int n)
{
return x >> n;
}
#endif /* __ARCH_X86_64_ARCH_INTRINSICS_H__ */
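/*
 * For reference, portable sketches (assuming a compiler with __uint128_t) of
 * what the widemul()/mac()/msb() helpers above compute; the inline assembly
 * above implements the same arithmetic with explicit MUL/MULX/ADD/ADC
 * sequences.
 */
static __inline__ __uint128_t widemul_portable(const uint64_t *a,
                                               const uint64_t *b)
{
    return ((__uint128_t)*a) * *b;                /* full 128-bit product */
}

static __inline__ void mac_portable(__uint128_t *acc, const uint64_t *a,
                                    const uint64_t *b)
{
    *acc += ((__uint128_t)*a) * *b;               /* multiply-accumulate */
}

static __inline__ void msb_portable(__uint128_t *acc, const uint64_t *a,
                                    const uint64_t *b)
{
    *acc -= ((__uint128_t)*a) * *b;               /* multiply-subtract */
}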
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2014 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#include "field.h"
void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
{
const uint64_t *a = as->limb, *b = bs->limb;
uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull << 56) - 1;
uint64_t aa[4] VECTOR_ALIGNED, bb[4] VECTOR_ALIGNED, bbb[4] VECTOR_ALIGNED;
/* For some reason clang doesn't vectorize this without prompting? */
unsigned int i;
for (i = 0; i < sizeof(aa) / sizeof(uint64xn_t); i++) {
((uint64xn_t *) aa)[i] =
((const uint64xn_t *)a)[i] + ((const uint64xn_t *)(&a[4]))[i];
((uint64xn_t *) bb)[i] =
((const uint64xn_t *)b)[i] + ((const uint64xn_t *)(&b[4]))[i];
((uint64xn_t *) bbb)[i] =
((const uint64xn_t *)bb)[i] + ((const uint64xn_t *)(&b[4]))[i];
}
/*
* for (int i=0; i<4; i++) { aa[i] = a[i] + a[i+4]; bb[i] = b[i] + b[i+4];
* }
*/
accum2 = widemul(&a[0], &b[3]);
accum0 = widemul(&aa[0], &bb[3]);
accum1 = widemul(&a[4], &b[7]);
mac(&accum2, &a[1], &b[2]);
mac(&accum0, &aa[1], &bb[2]);
mac(&accum1, &a[5], &b[6]);
mac(&accum2, &a[2], &b[1]);
mac(&accum0, &aa[2], &bb[1]);
mac(&accum1, &a[6], &b[5]);
mac(&accum2, &a[3], &b[0]);
mac(&accum0, &aa[3], &bb[0]);
mac(&accum1, &a[7], &b[4]);
accum0 -= accum2;
accum1 += accum2;
c[3] = ((uint64_t)(accum1)) & mask;
c[7] = ((uint64_t)(accum0)) & mask;
accum0 >>= 56;
accum1 >>= 56;
mac(&accum0, &aa[1], &bb[3]);
mac(&accum1, &a[5], &b[7]);
mac(&accum0, &aa[2], &bb[2]);
mac(&accum1, &a[6], &b[6]);
mac(&accum0, &aa[3], &bb[1]);
accum1 += accum0;
accum2 = widemul(&a[0], &b[0]);
accum1 -= accum2;
accum0 += accum2;
msb(&accum0, &a[1], &b[3]);
msb(&accum0, &a[2], &b[2]);
mac(&accum1, &a[7], &b[5]);
msb(&accum0, &a[3], &b[1]);
mac(&accum1, &aa[0], &bb[0]);
mac(&accum0, &a[4], &b[4]);
c[0] = ((uint64_t)(accum0)) & mask;
c[4] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(&a[2], &b[7]);
mac(&accum0, &a[6], &bb[3]);
mac(&accum1, &aa[2], &bbb[3]);
mac(&accum2, &a[3], &b[6]);
mac(&accum0, &a[7], &bb[2]);
mac(&accum1, &aa[3], &bbb[2]);
mac(&accum2, &a[0], &b[1]);
mac(&accum1, &aa[0], &bb[1]);
mac(&accum0, &a[4], &b[5]);
mac(&accum2, &a[1], &b[0]);
mac(&accum1, &aa[1], &bb[0]);
mac(&accum0, &a[5], &b[4]);
accum1 -= accum2;
accum0 += accum2;
c[1] = ((uint64_t)(accum0)) & mask;
c[5] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(&a[3], &b[7]);
mac(&accum0, &a[7], &bb[3]);
mac(&accum1, &aa[3], &bbb[3]);
mac(&accum2, &a[0], &b[2]);
mac(&accum1, &aa[0], &bb[2]);
mac(&accum0, &a[4], &b[6]);
mac(&accum2, &a[1], &b[1]);
mac(&accum1, &aa[1], &bb[1]);
mac(&accum0, &a[5], &b[5]);
mac(&accum2, &a[2], &b[0]);
mac(&accum1, &aa[2], &bb[0]);
mac(&accum0, &a[6], &b[4]);
accum1 -= accum2;
accum0 += accum2;
c[2] = ((uint64_t)(accum0)) & mask;
c[6] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum0 += c[3];
accum1 += c[7];
c[3] = ((uint64_t)(accum0)) & mask;
c[7] = ((uint64_t)(accum1)) & mask;
/* we could almost stop here, but it wouldn't be stable, so... */
accum0 >>= 56;
accum1 >>= 56;
c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
c[0] += ((uint64_t)(accum1));
}
void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
{
const uint64_t *a = as->limb;
uint64_t *c = cs->limb;
__uint128_t accum0, accum4;
uint64_t mask = (1ull << 56) - 1;
accum0 = widemul_rm(b, &a[0]);
accum4 = widemul_rm(b, &a[4]);
c[0] = accum0 & mask;
accum0 >>= 56;
c[4] = accum4 & mask;
accum4 >>= 56;
mac_rm(&accum0, b, &a[1]);
mac_rm(&accum4, b, &a[5]);
c[1] = accum0 & mask;
accum0 >>= 56;
c[5] = accum4 & mask;
accum4 >>= 56;
mac_rm(&accum0, b, &a[2]);
mac_rm(&accum4, b, &a[6]);
c[2] = accum0 & mask;
accum0 >>= 56;
c[6] = accum4 & mask;
accum4 >>= 56;
mac_rm(&accum0, b, &a[3]);
mac_rm(&accum4, b, &a[7]);
c[3] = accum0 & mask;
accum0 >>= 56;
c[7] = accum4 & mask;
accum4 >>= 56;
accum0 += accum4 + c[4];
c[4] = accum0 & mask;
c[5] += accum0 >> 56;
accum4 += c[0];
c[0] = accum4 & mask;
c[1] += accum4 >> 56;
}
void gf_sqr(gf_s * __restrict__ cs, const gf as)
{
const uint64_t *a = as->limb;
uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum1 = 0, accum2;
uint64_t mask = (1ull << 56) - 1;
uint64_t aa[4] VECTOR_ALIGNED;
/* For some reason clang doesn't vectorize this without prompting? */
unsigned int i;
for (i = 0; i < sizeof(aa) / sizeof(uint64xn_t); i++) {
((uint64xn_t *) aa)[i] =
((const uint64xn_t *)a)[i] + ((const uint64xn_t *)(&a[4]))[i];
}
accum2 = widemul(&a[0], &a[3]);
accum0 = widemul(&aa[0], &aa[3]);
accum1 = widemul(&a[4], &a[7]);
mac(&accum2, &a[1], &a[2]);
mac(&accum0, &aa[1], &aa[2]);
mac(&accum1, &a[5], &a[6]);
accum0 -= accum2;
accum1 += accum2;
c[3] = ((uint64_t)(accum1)) << 1 & mask;
c[7] = ((uint64_t)(accum0)) << 1 & mask;
accum0 >>= 55;
accum1 >>= 55;
mac2(&accum0, &aa[1], &aa[3]);
mac2(&accum1, &a[5], &a[7]);
mac(&accum0, &aa[2], &aa[2]);
accum1 += accum0;
msb2(&accum0, &a[1], &a[3]);
mac(&accum1, &a[6], &a[6]);
accum2 = widemul(&a[0], &a[0]);
accum1 -= accum2;
accum0 += accum2;
msb(&accum0, &a[2], &a[2]);
mac(&accum1, &aa[0], &aa[0]);
mac(&accum0, &a[4], &a[4]);
c[0] = ((uint64_t)(accum0)) & mask;
c[4] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul2(&aa[2], &aa[3]);
msb2(&accum0, &a[2], &a[3]);
mac2(&accum1, &a[6], &a[7]);
accum1 += accum2;
accum0 += accum2;
accum2 = widemul2(&a[0], &a[1]);
mac2(&accum1, &aa[0], &aa[1]);
mac2(&accum0, &a[4], &a[5]);
accum1 -= accum2;
accum0 += accum2;
c[1] = ((uint64_t)(accum0)) & mask;
c[5] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum2 = widemul(&aa[3], &aa[3]);
msb(&accum0, &a[3], &a[3]);
mac(&accum1, &a[7], &a[7]);
accum1 += accum2;
accum0 += accum2;
accum2 = widemul2(&a[0], &a[2]);
mac2(&accum1, &aa[0], &aa[2]);
mac2(&accum0, &a[4], &a[6]);
mac(&accum2, &a[1], &a[1]);
mac(&accum1, &aa[1], &aa[1]);
mac(&accum0, &a[5], &a[5]);
accum1 -= accum2;
accum0 += accum2;
c[2] = ((uint64_t)(accum0)) & mask;
c[6] = ((uint64_t)(accum1)) & mask;
accum0 >>= 56;
accum1 >>= 56;
accum0 += c[3];
accum1 += c[7];
c[3] = ((uint64_t)(accum0)) & mask;
c[7] = ((uint64_t)(accum1)) & mask;
/* we could almost stop here, but it wouldn't be stable, so... */
accum0 >>= 56;
accum1 >>= 56;
c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
c[0] += ((uint64_t)(accum1));
}
/*
* Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright 2014-2016 Cryptography Research, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Mike Hamburg
*/
#define GF_HEADROOM 60
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
#define LIMB_PLACE_VALUE(i) 56
void gf_add_RAW(gf out, const gf a, const gf b)
{
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint64xn_t); i++) {
((uint64xn_t *) out)[i] =
((const uint64xn_t *)a)[i] + ((const uint64xn_t *)b)[i];
}
}
void gf_sub_RAW(gf out, const gf a, const gf b)
{
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint64xn_t); i++) {
((uint64xn_t *) out)[i] =
((const uint64xn_t *)a)[i] - ((const uint64xn_t *)b)[i];
}
}
void gf_bias(gf a, int amt)
{
uint64_t co1 = ((1ull << 56) - 1) * amt, co2 = co1 - amt;
#if __AVX2__
uint64x4_t lo = { co1, co1, co1, co1 }, hi = { co2, co1, co1, co1 };
uint64x4_t *aa = (uint64x4_t *) a;
aa[0] += lo;
aa[1] += hi;
#elif __SSE2__
uint64x2_t lo = { co1, co1 }, hi = { co2, co1 };
uint64x2_t *aa = (uint64x2_t *) a;
aa[0] += lo;
aa[1] += lo;
aa[2] += hi;
aa[3] += lo;
#else
for (unsigned int i = 0; i < sizeof(*a) / sizeof(uint64_t); i++) {
a->limb[i] += (i == 4) ? co2 : co1;
}
#endif
}
void gf_weak_reduce(gf a)
{
/* PERF: use pshufb/palignr if anyone cares about speed of this */
uint64_t mask = (1ull << 56) - 1;
uint64_t tmp = a->limb[7] >> 56;
a->limb[4] += tmp;
for (unsigned int i = 7; i > 0; i--) {
a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56);
}
a->limb[0] = (a->limb[0] & mask) + tmp;
}