提交 eb791696 编写于 作者: A Andy Polyakov

ec/ecp_nistz256.c: improve ECDSA sign by 30-40%.

This is based on RT#3810, which added dedicated modular inversion.
ECDSA verify results improves as well, but not as much.
Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/5001)
上级 617b49db
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
# Copyright (c) 2015 CloudFlare, Inc.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
# (3) CloudFlare, Inc.
#
# Reference:
# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
......@@ -18,23 +20,25 @@
# Further optimization by <appro@openssl.org>:
#
# this/original with/without -DECP_NISTZ256_ASM(*)
# Opteron +12-49% +110-150%
# Bulldozer +14-45% +175-210%
# P4 +18-46% n/a :-(
# Westmere +12-34% +80-87%
# Sandy Bridge +9-35% +110-120%
# Ivy Bridge +9-35% +110-125%
# Haswell +8-37% +140-160%
# Broadwell +18-58% +145-210%
# Atom +15-50% +130-180%
# VIA Nano +43-160% +300-480%
# Opteron +15-49% +150-195%
# Bulldozer +18-45% +175-240%
# P4 +24-46% +100-150%
# Westmere +18-34% +87-160%
# Sandy Bridge +14-35% +120-185%
# Ivy Bridge +11-35% +125-180%
# Haswell +10-37% +160-200%
# Broadwell +24-58% +210-270%
# Atom +20-50% +180-240%
# VIA Nano +50-160% +480-480%
#
# (*) "without -DECP_NISTZ256_ASM" refers to build with
# "enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
# server-side operation. Keep in mind that +100% means 2x improvement.
# on benchmark. In "this/original" column lower coefficient is for
# ECDSA sign, while in "with/without" - for ECDH key agreement, and
# higher - for ECDSA sign, relatively fastest server-side operation.
# Keep in mind that +100% means 2x improvement.
$flavour = shift;
$output = shift;
......@@ -95,6 +99,12 @@ $code.=<<___;
.long 3,3,3,3,3,3,3,3
.LONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
# Constants for computations modulo ord(p256)
.Lord:
.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
.LordK:
.quad 0xccd1c8aaee00bc4f
___
{
......@@ -481,6 +491,1014 @@ my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
my ($poly1,$poly3)=($acc6,$acc7);
$code.=<<___;
################################################################################
# void ecp_nistz256_ord_mul_mont(
# uint64_t res[4],
# uint64_t a[4],
# uint64_t b[4]);
#
# Word-by-word Montgomery multiplication modulo ord(P-256):
# res = a*b*2^-256 mod .Lord.  Each round accumulates a[0..3]*b[i] and
# folds in m*.Lord with m = low_limb * .LordK (= -ord^-1 mod 2^64) so the
# lowest limb cancels; a final conditional subtraction brings the result
# below the modulus.
.globl ecp_nistz256_ord_mul_mont
.type ecp_nistz256_ord_mul_mont,\@function,3
.align 32
ecp_nistz256_ord_mul_mont:
___
$code.=<<___ if ($addx);
# Dispatch to the MULX/ADCX/ADOX flavour when both BMI2 and ADX
# capability bits are set in OPENSSL_ia32cap_P.
mov \$0x80100, %ecx
and OPENSSL_ia32cap_P+8(%rip), %ecx
cmp \$0x80100, %ecx
je .Lecp_nistz256_ord_mul_montx
___
$code.=<<___;
push %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
mov 8*0($b_org), %rax
mov $b_org, $b_ptr
lea .Lord(%rip), %r14
mov .LordK(%rip), %r15
################################# * b[0]
mov %rax, $t0
mulq 8*0($a_ptr)
mov %rax, $acc0
mov $t0, %rax
mov %rdx, $acc1
mulq 8*1($a_ptr)
add %rax, $acc1
mov $t0, %rax
adc \$0, %rdx
mov %rdx, $acc2
mulq 8*2($a_ptr)
add %rax, $acc2
mov $t0, %rax
adc \$0, %rdx
mov $acc0, $acc5
imulq %r15,$acc0
mov %rdx, $acc3
mulq 8*3($a_ptr)
add %rax, $acc3
mov $acc0, %rax
adc \$0, %rdx
mov %rdx, $acc4
################################# First reduction step
# The two upper limbs of .Lord are 0xffffffffffffffff and
# 0xffffffff00000000, so their products are formed with sub/shl/shr
# instead of multiplications.
mulq 8*0(%r14)
mov $acc0, $t1
add %rax, $acc5 # guaranteed to be zero
mov $acc0, %rax
adc \$0, %rdx
mov %rdx, $t0
sub $acc0, $acc2
sbb \$0, $acc0 # can't borrow
mulq 8*1(%r14)
add $t0, $acc1
adc \$0, %rdx
add %rax, $acc1
mov $t1, %rax
adc %rdx, $acc2
mov $t1, %rdx
adc \$0, $acc0 # can't overflow
shl \$32, %rax
shr \$32, %rdx
sub %rax, $acc3
mov 8*1($b_ptr), %rax
sbb %rdx, $t1 # can't borrow
add $acc0, $acc3
adc $t1, $acc4
adc \$0, $acc5
################################# * b[1]
mov %rax, $t0
mulq 8*0($a_ptr)
add %rax, $acc1
mov $t0, %rax
adc \$0, %rdx
mov %rdx, $t1
mulq 8*1($a_ptr)
add $t1, $acc2
adc \$0, %rdx
add %rax, $acc2
mov $t0, %rax
adc \$0, %rdx
mov %rdx, $t1
mulq 8*2($a_ptr)
add $t1, $acc3
adc \$0, %rdx
add %rax, $acc3
mov $t0, %rax
adc \$0, %rdx
mov $acc1, $t0
imulq %r15, $acc1
mov %rdx, $t1
mulq 8*3($a_ptr)
add $t1, $acc4
adc \$0, %rdx
xor $acc0, $acc0
add %rax, $acc4
mov $acc1, %rax
adc %rdx, $acc5
adc \$0, $acc0
################################# Second reduction step
mulq 8*0(%r14)
mov $acc1, $t1
add %rax, $t0 # guaranteed to be zero
mov $acc1, %rax
adc %rdx, $t0
sub $acc1, $acc3
sbb \$0, $acc1 # can't borrow
mulq 8*1(%r14)
add $t0, $acc2
adc \$0, %rdx
add %rax, $acc2
mov $t1, %rax
adc %rdx, $acc3
mov $t1, %rdx
adc \$0, $acc1 # can't overflow
shl \$32, %rax
shr \$32, %rdx
sub %rax, $acc4
mov 8*2($b_ptr), %rax
sbb %rdx, $t1 # can't borrow
add $acc1, $acc4
adc $t1, $acc5
adc \$0, $acc0
################################## * b[2]
mov %rax, $t0
mulq 8*0($a_ptr)
add %rax, $acc2
mov $t0, %rax
adc \$0, %rdx
mov %rdx, $t1
mulq 8*1($a_ptr)
add $t1, $acc3
adc \$0, %rdx
add %rax, $acc3
mov $t0, %rax
adc \$0, %rdx
mov %rdx, $t1
mulq 8*2($a_ptr)
add $t1, $acc4
adc \$0, %rdx
add %rax, $acc4
mov $t0, %rax
adc \$0, %rdx
mov $acc2, $t0
imulq %r15, $acc2
mov %rdx, $t1
mulq 8*3($a_ptr)
add $t1, $acc5
adc \$0, %rdx
xor $acc1, $acc1
add %rax, $acc5
mov $acc2, %rax
adc %rdx, $acc0
adc \$0, $acc1
################################# Third reduction step
mulq 8*0(%r14)
mov $acc2, $t1
add %rax, $t0 # guaranteed to be zero
mov $acc2, %rax
adc %rdx, $t0
sub $acc2, $acc4
sbb \$0, $acc2 # can't borrow
mulq 8*1(%r14)
add $t0, $acc3
adc \$0, %rdx
add %rax, $acc3
mov $t1, %rax
adc %rdx, $acc4
mov $t1, %rdx
adc \$0, $acc2 # can't overflow
shl \$32, %rax
shr \$32, %rdx
sub %rax, $acc5
mov 8*3($b_ptr), %rax
sbb %rdx, $t1 # can't borrow
add $acc2, $acc5
adc $t1, $acc0
adc \$0, $acc1
################################# * b[3]
mov %rax, $t0
mulq 8*0($a_ptr)
add %rax, $acc3
mov $t0, %rax
adc \$0, %rdx
mov %rdx, $t1
mulq 8*1($a_ptr)
add $t1, $acc4
adc \$0, %rdx
add %rax, $acc4
mov $t0, %rax
adc \$0, %rdx
mov %rdx, $t1
mulq 8*2($a_ptr)
add $t1, $acc5
adc \$0, %rdx
add %rax, $acc5
mov $t0, %rax
adc \$0, %rdx
mov $acc3, $t0
imulq %r15, $acc3
mov %rdx, $t1
mulq 8*3($a_ptr)
add $t1, $acc0
adc \$0, %rdx
xor $acc2, $acc2
add %rax, $acc0
mov $acc3, %rax
adc %rdx, $acc1
adc \$0, $acc2
################################# Last reduction step
mulq 8*0(%r14)
mov $acc3, $t1
add %rax, $t0 # guaranteed to be zero
mov $acc3, %rax
adc %rdx, $t0
sub $acc3, $acc5
sbb \$0, $acc3 # can't borrow
mulq 8*1(%r14)
add $t0, $acc4
adc \$0, %rdx
add %rax, $acc4
mov $t1, %rax
adc %rdx, $acc5
mov $t1, %rdx
adc \$0, $acc3 # can't overflow
shl \$32, %rax
shr \$32, %rdx
sub %rax, $acc0
sbb %rdx, $t1 # can't borrow
add $acc3, $acc0
adc $t1, $acc1
adc \$0, $acc2
################################# Subtract ord
# Branch-less final reduction: subtract the modulus and keep the
# pre-subtraction value (cmovc) if the subtraction borrowed.
mov $acc4, $a_ptr
sub 8*0(%r14), $acc4
mov $acc5, $acc3
sbb 8*1(%r14), $acc5
mov $acc0, $t0
sbb 8*2(%r14), $acc0
mov $acc1, $t1
sbb 8*3(%r14), $acc1
sbb \$0, $acc2
cmovc $a_ptr, $acc4
cmovc $acc3, $acc5
cmovc $t0, $acc0
cmovc $t1, $acc1
mov $acc4, 8*0($r_ptr)
mov $acc5, 8*1($r_ptr)
mov $acc0, 8*2($r_ptr)
mov $acc1, 8*3($r_ptr)
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
ret
.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
################################################################################
# void ecp_nistz256_ord_sqr_mont(
# uint64_t res[4],
# uint64_t a[4],
# int rep);
#
# Montgomery squaring modulo ord(P-256), repeated |rep| times:
# res = a^(2^rep)*2^-256...  The squaring proper computes the off-diagonal
# products once, doubles them, adds the diagonal a[i]^2 terms, then runs
# four reduction rounds against .Lord (pointed at by $a_ptr inside the
# loop).  rep > 1 lets callers chain squarings without reloading.
.globl ecp_nistz256_ord_sqr_mont
.type ecp_nistz256_ord_sqr_mont,\@function,3
.align 32
ecp_nistz256_ord_sqr_mont:
___
$code.=<<___ if ($addx);
# Dispatch to the MULX/ADCX/ADOX flavour when BMI2 and ADX are available.
mov \$0x80100, %ecx
and OPENSSL_ia32cap_P+8(%rip), %ecx
cmp \$0x80100, %ecx
je .Lecp_nistz256_ord_sqr_montx
___
$code.=<<___;
push %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
mov 8*0($a_ptr), $acc0
mov 8*1($a_ptr), %rax
mov 8*2($a_ptr), $acc6
mov 8*3($a_ptr), $acc7
lea .Lord(%rip), $a_ptr # pointer to modulus
mov $b_org, $b_ptr
jmp .Loop_ord_sqr
.align 32
.Loop_ord_sqr:
################################# a[1:] * a[0]
mov %rax, $t1 # put aside a[1]
mul $acc0 # a[1] * a[0]
mov %rax, $acc1
movq $t1, %xmm1 # offload a[1]
mov $acc6, %rax
mov %rdx, $acc2
mul $acc0 # a[2] * a[0]
add %rax, $acc2
mov $acc7, %rax
movq $acc6, %xmm2 # offload a[2]
adc \$0, %rdx
mov %rdx, $acc3
mul $acc0 # a[3] * a[0]
add %rax, $acc3
mov $acc7, %rax
movq $acc7, %xmm3 # offload a[3]
adc \$0, %rdx
mov %rdx, $acc4
################################# a[3] * a[2]
mul $acc6 # a[3] * a[2]
mov %rax, $acc5
mov $acc6, %rax
mov %rdx, $acc6
################################# a[2:] * a[1]
mul $t1 # a[2] * a[1]
add %rax, $acc3
mov $acc7, %rax
adc \$0, %rdx
mov %rdx, $acc7
mul $t1 # a[3] * a[1]
add %rax, $acc4
adc \$0, %rdx
add $acc7, $acc4
adc %rdx, $acc5
adc \$0, $acc6 # can't overflow
################################# *2
# Double the off-diagonal products (each a[i]*a[j], i!=j, occurs twice
# in the square).
xor $acc7, $acc7
mov $acc0, %rax
add $acc1, $acc1
adc $acc2, $acc2
adc $acc3, $acc3
adc $acc4, $acc4
adc $acc5, $acc5
adc $acc6, $acc6
adc \$0, $acc7
################################# Missing products
mul %rax # a[0] * a[0]
mov %rax, $acc0
movq %xmm1, %rax
mov %rdx, $t1
mul %rax # a[1] * a[1]
add $t1, $acc1
adc %rax, $acc2
movq %xmm2, %rax
adc \$0, %rdx
mov %rdx, $t1
mul %rax # a[2] * a[2]
add $t1, $acc3
adc %rax, $acc4
movq %xmm3, %rax
adc \$0, %rdx
mov %rdx, $t1
mov $acc0, $t0
imulq 8*4($a_ptr), $acc0 # *= .LordK
mul %rax # a[3] * a[3]
add $t1, $acc5
adc %rax, $acc6
mov 8*0($a_ptr), %rax # modulus[0]
adc %rdx, $acc7 # can't overflow
################################# First reduction step
mul $acc0
mov $acc0, $t1
add %rax, $t0 # guaranteed to be zero
mov 8*1($a_ptr), %rax # modulus[1]
adc %rdx, $t0
sub $acc0, $acc2
sbb \$0, $t1 # can't borrow
mul $acc0
add $t0, $acc1
adc \$0, %rdx
add %rax, $acc1
mov $acc0, %rax
adc %rdx, $acc2
mov $acc0, %rdx
adc \$0, $t1 # can't overflow
mov $acc1, $t0
imulq 8*4($a_ptr), $acc1 # *= .LordK
shl \$32, %rax
shr \$32, %rdx
sub %rax, $acc3
mov 8*0($a_ptr), %rax
sbb %rdx, $acc0 # can't borrow
add $t1, $acc3
adc \$0, $acc0 # can't overflow
################################# Second reduction step
mul $acc1
mov $acc1, $t1
add %rax, $t0 # guaranteed to be zero
mov 8*1($a_ptr), %rax
adc %rdx, $t0
sub $acc1, $acc3
sbb \$0, $t1 # can't borrow
mul $acc1
add $t0, $acc2
adc \$0, %rdx
add %rax, $acc2
mov $acc1, %rax
adc %rdx, $acc3
mov $acc1, %rdx
adc \$0, $t1 # can't overflow
mov $acc2, $t0
imulq 8*4($a_ptr), $acc2 # *= .LordK
shl \$32, %rax
shr \$32, %rdx
sub %rax, $acc0
mov 8*0($a_ptr), %rax
sbb %rdx, $acc1 # can't borrow
add $t1, $acc0
adc \$0, $acc1 # can't overflow
################################# Third reduction step
mul $acc2
mov $acc2, $t1
add %rax, $t0 # guaranteed to be zero
mov 8*1($a_ptr), %rax
adc %rdx, $t0
sub $acc2, $acc0
sbb \$0, $t1 # can't borrow
mul $acc2
add $t0, $acc3
adc \$0, %rdx
add %rax, $acc3
mov $acc2, %rax
adc %rdx, $acc0
mov $acc2, %rdx
adc \$0, $t1 # can't overflow
mov $acc3, $t0
imulq 8*4($a_ptr), $acc3 # *= .LordK
shl \$32, %rax
shr \$32, %rdx
sub %rax, $acc1
mov 8*0($a_ptr), %rax
sbb %rdx, $acc2 # can't borrow
add $t1, $acc1
adc \$0, $acc2 # can't overflow
################################# Last reduction step
mul $acc3
mov $acc3, $t1
add %rax, $t0 # guaranteed to be zero
mov 8*1($a_ptr), %rax
adc %rdx, $t0
sub $acc3, $acc1
sbb \$0, $t1 # can't borrow
mul $acc3
add $t0, $acc0
adc \$0, %rdx
add %rax, $acc0
mov $acc3, %rax
adc %rdx, $acc1
mov $acc3, %rdx
adc \$0, $t1 # can't overflow
shl \$32, %rax
shr \$32, %rdx
sub %rax, $acc2
sbb %rdx, $acc3 # can't borrow
add $t1, $acc2
adc \$0, $acc3 # can't overflow
################################# Add bits [511:256] of the sqr result
xor %rdx, %rdx
add $acc4, $acc0
adc $acc5, $acc1
mov $acc0, $acc4
adc $acc6, $acc2
adc $acc7, $acc3
mov $acc1, %rax
adc \$0, %rdx
################################# Compare to modulus
# Branch-less conditional subtraction; result stays in registers for the
# next iteration of .Loop_ord_sqr.
sub 8*0($a_ptr), $acc0
mov $acc2, $acc6
sbb 8*1($a_ptr), $acc1
sbb 8*2($a_ptr), $acc2
mov $acc3, $acc7
sbb 8*3($a_ptr), $acc3
sbb \$0, %rdx
cmovc $acc4, $acc0
cmovnc $acc1, %rax
cmovnc $acc2, $acc6
cmovnc $acc3, $acc7
dec $b_ptr
jnz .Loop_ord_sqr
mov $acc0, 8*0($r_ptr)
mov %rax, 8*1($r_ptr)
pxor %xmm1, %xmm1
mov $acc6, 8*2($r_ptr)
pxor %xmm2, %xmm2
mov $acc7, 8*3($r_ptr)
pxor %xmm3, %xmm3
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
ret
.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
$code.=<<___ if ($addx);
################################################################################
# MULX/ADCX/ADOX (BMI2+ADX) flavour of ecp_nistz256_ord_mul_mont.
# Two independent carry chains (CF via adcx, OF via adox) interleave the
# multiply-accumulate and the Montgomery reduction against .Lord.
.type ecp_nistz256_ord_mul_montx,\@function,3
.align 32
ecp_nistz256_ord_mul_montx:
.Lecp_nistz256_ord_mul_montx:
push %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
mov $b_org, $b_ptr
mov 8*0($b_org), %rdx
mov 8*0($a_ptr), $acc1
mov 8*1($a_ptr), $acc2
mov 8*2($a_ptr), $acc3
mov 8*3($a_ptr), $acc4
lea -128($a_ptr), $a_ptr # control u-op density
lea .Lord-128(%rip), %r14
mov .LordK(%rip), %r15
################################# Multiply by b[0]
mulx $acc1, $acc0, $acc1
mulx $acc2, $t0, $acc2
mulx $acc3, $t1, $acc3
add $t0, $acc1
mulx $acc4, $t0, $acc4
mov $acc0, %rdx
mulx %r15, %rdx, %rax
adc $t1, $acc2
adc $t0, $acc3
adc \$0, $acc4
################################# reduction
xor $acc5, $acc5 # $acc5=0, cf=0, of=0
mulx 8*0+128(%r14), $t0, $t1
adcx $t0, $acc0 # guaranteed to be zero
adox $t1, $acc1
mulx 8*1+128(%r14), $t0, $t1
adcx $t0, $acc1
adox $t1, $acc2
mulx 8*2+128(%r14), $t0, $t1
adcx $t0, $acc2
adox $t1, $acc3
mulx 8*3+128(%r14), $t0, $t1
mov 8*1($b_ptr), %rdx
adcx $t0, $acc3
adox $t1, $acc4
adcx $acc0, $acc4
adox $acc0, $acc5
adc \$0, $acc5 # cf=0, of=0
################################# Multiply by b[1]
mulx 8*0+128($a_ptr), $t0, $t1
adcx $t0, $acc1
adox $t1, $acc2
mulx 8*1+128($a_ptr), $t0, $t1
adcx $t0, $acc2
adox $t1, $acc3
mulx 8*2+128($a_ptr), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc4
mulx 8*3+128($a_ptr), $t0, $t1
mov $acc1, %rdx
mulx %r15, %rdx, %rax
adcx $t0, $acc4
adox $t1, $acc5
adcx $acc0, $acc5
adox $acc0, $acc0
adc \$0, $acc0 # cf=0, of=0
################################# reduction
mulx 8*0+128(%r14), $t0, $t1
adcx $t0, $acc1 # guaranteed to be zero
adox $t1, $acc2
mulx 8*1+128(%r14), $t0, $t1
adcx $t0, $acc2
adox $t1, $acc3
mulx 8*2+128(%r14), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc4
mulx 8*3+128(%r14), $t0, $t1
mov 8*2($b_ptr), %rdx
adcx $t0, $acc4
adox $t1, $acc5
adcx $acc1, $acc5
adox $acc1, $acc0
adc \$0, $acc0 # cf=0, of=0
################################# Multiply by b[2]
mulx 8*0+128($a_ptr), $t0, $t1
adcx $t0, $acc2
adox $t1, $acc3
mulx 8*1+128($a_ptr), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc4
mulx 8*2+128($a_ptr), $t0, $t1
adcx $t0, $acc4
adox $t1, $acc5
mulx 8*3+128($a_ptr), $t0, $t1
mov $acc2, %rdx
mulx %r15, %rdx, %rax
adcx $t0, $acc5
adox $t1, $acc0
adcx $acc1, $acc0
adox $acc1, $acc1
adc \$0, $acc1 # cf=0, of=0
################################# reduction
mulx 8*0+128(%r14), $t0, $t1
adcx $t0, $acc2 # guaranteed to be zero
adox $t1, $acc3
mulx 8*1+128(%r14), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc4
mulx 8*2+128(%r14), $t0, $t1
adcx $t0, $acc4
adox $t1, $acc5
mulx 8*3+128(%r14), $t0, $t1
mov 8*3($b_ptr), %rdx
adcx $t0, $acc5
adox $t1, $acc0
adcx $acc2, $acc0
adox $acc2, $acc1
adc \$0, $acc1 # cf=0, of=0
################################# Multiply by b[3]
mulx 8*0+128($a_ptr), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc4
mulx 8*1+128($a_ptr), $t0, $t1
adcx $t0, $acc4
adox $t1, $acc5
mulx 8*2+128($a_ptr), $t0, $t1
adcx $t0, $acc5
adox $t1, $acc0
mulx 8*3+128($a_ptr), $t0, $t1
mov $acc3, %rdx
mulx %r15, %rdx, %rax
adcx $t0, $acc0
adox $t1, $acc1
adcx $acc2, $acc1
adox $acc2, $acc2
adc \$0, $acc2 # cf=0, of=0
################################# reduction
mulx 8*0+128(%r14), $t0, $t1
adcx $t0, $acc3 # guaranteed to be zero
adox $t1, $acc4
mulx 8*1+128(%r14), $t0, $t1
adcx $t0, $acc4
adox $t1, $acc5
mulx 8*2+128(%r14), $t0, $t1
adcx $t0, $acc5
adox $t1, $acc0
mulx 8*3+128(%r14), $t0, $t1
lea 128(%r14),%r14
mov $acc4, $t2
adcx $t0, $acc0
adox $t1, $acc1
mov $acc5, $t3
adcx $acc3, $acc1
adox $acc3, $acc2
adc \$0, $acc2
#################################
# Branch-less conditional subtraction of ord
mov $acc0, $t0
sub 8*0(%r14), $acc4
sbb 8*1(%r14), $acc5
sbb 8*2(%r14), $acc0
mov $acc1, $t1
sbb 8*3(%r14), $acc1
sbb \$0, $acc2
cmovc $t2, $acc4
cmovc $t3, $acc5
cmovc $t0, $acc0
cmovc $t1, $acc1
mov $acc4, 8*0($r_ptr)
mov $acc5, 8*1($r_ptr)
mov $acc0, 8*2($r_ptr)
mov $acc1, 8*3($r_ptr)
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
ret
.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
.type ecp_nistz256_ord_sqr_montx,\@function,3
.align 32
ecp_nistz256_ord_sqr_montx:
.Lecp_nistz256_ord_sqr_montx:
################################################################################
# MULX/ADCX/ADOX (BMI2+ADX) flavour of ecp_nistz256_ord_sqr_mont:
# repeated Montgomery squaring modulo ord(P-256), |rep| iterations.
push %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
mov $b_org, $b_ptr
mov 8*0($a_ptr), %rdx
mov 8*1($a_ptr), $acc6
mov 8*2($a_ptr), $acc7
mov 8*3($a_ptr), $acc0
lea .Lord(%rip), $a_ptr
jmp .Loop_ord_sqrx
.align 32
.Loop_ord_sqrx:
mulx $acc6, $acc1, $acc2 # a[0]*a[1]
mulx $acc7, $t0, $acc3 # a[0]*a[2]
mov %rdx, %rax # offload a[0]
movq $acc6, %xmm1 # offload a[1]
mulx $acc0, $t1, $acc4 # a[0]*a[3]
mov $acc6, %rdx
add $t0, $acc2
movq $acc7, %xmm2 # offload a[2]
adc $t1, $acc3
adc \$0, $acc4
xor $acc5, $acc5 # $acc5=0,cf=0,of=0
#################################
mulx $acc7, $t0, $t1 # a[1]*a[2]
adcx $t0, $acc3
adox $t1, $acc4
mulx $acc0, $t0, $t1 # a[1]*a[3]
mov $acc7, %rdx
adcx $t0, $acc4
adox $t1, $acc5
adc \$0, $acc5
#################################
mulx $acc0, $t0, $acc6 # a[2]*a[3]
mov %rax, %rdx
movq $acc0, %xmm3 # offload a[3]
xor $acc7, $acc7 # $acc7=0,cf=0,of=0
# Double the off-diagonal products on the CF chain while folding
# the last cross product in on the OF chain.
adcx $acc1, $acc1 # acc1:6<<1
adox $t0, $acc5
adcx $acc2, $acc2
adox $acc7, $acc6 # of=0
################################# a[i]*a[i]
mulx %rdx, $acc0, $t1
movq %xmm1, %rdx
adcx $acc3, $acc3
adox $t1, $acc1
adcx $acc4, $acc4
mulx %rdx, $t0, $t4
movq %xmm2, %rdx
adcx $acc5, $acc5
adox $t0, $acc2
adcx $acc6, $acc6
mulx %rdx, $t0, $t1
.byte 0x67
movq %xmm3, %rdx
adox $t4, $acc3
adcx $acc7, $acc7
adox $t0, $acc4
adox $t1, $acc5
mulx %rdx, $t0, $t4
adox $t0, $acc6
adox $t4, $acc7
################################# reduction
# Four rounds: m = limb * .LordK (8*4($a_ptr)), then limbs of m*.Lord
# are folded in; low limb of each round is guaranteed to cancel.
mov $acc0, %rdx
mulx 8*4($a_ptr), %rdx, $t0
xor %rax, %rax # cf=0, of=0
mulx 8*0($a_ptr), $t0, $t1
adcx $t0, $acc0 # guaranteed to be zero
adox $t1, $acc1
mulx 8*1($a_ptr), $t0, $t1
adcx $t0, $acc1
adox $t1, $acc2
mulx 8*2($a_ptr), $t0, $t1
adcx $t0, $acc2
adox $t1, $acc3
mulx 8*3($a_ptr), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc0 # of=0
adcx %rax, $acc0 # cf=0
#################################
mov $acc1, %rdx
mulx 8*4($a_ptr), %rdx, $t0
mulx 8*0($a_ptr), $t0, $t1
adox $t0, $acc1 # guaranteed to be zero
adcx $t1, $acc2
mulx 8*1($a_ptr), $t0, $t1
adox $t0, $acc2
adcx $t1, $acc3
mulx 8*2($a_ptr), $t0, $t1
adox $t0, $acc3
adcx $t1, $acc0
mulx 8*3($a_ptr), $t0, $t1
adox $t0, $acc0
adcx $t1, $acc1 # cf=0
adox %rax, $acc1 # of=0
#################################
mov $acc2, %rdx
mulx 8*4($a_ptr), %rdx, $t0
mulx 8*0($a_ptr), $t0, $t1
adcx $t0, $acc2 # guaranteed to be zero
adox $t1, $acc3
mulx 8*1($a_ptr), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc0
mulx 8*2($a_ptr), $t0, $t1
adcx $t0, $acc0
adox $t1, $acc1
mulx 8*3($a_ptr), $t0, $t1
adcx $t0, $acc1
adox $t1, $acc2 # of=0
adcx %rax, $acc2 # cf=0
#################################
mov $acc3, %rdx
mulx 8*4($a_ptr), %rdx, $t0
mulx 8*0($a_ptr), $t0, $t1
adox $t0, $acc3 # guaranteed to be zero
adcx $t1, $acc0
mulx 8*1($a_ptr), $t0, $t1
adox $t0, $acc0
adcx $t1, $acc1
mulx 8*2($a_ptr), $t0, $t1
adox $t0, $acc1
adcx $t1, $acc2
mulx 8*3($a_ptr), $t0, $t1
adox $t0, $acc2
adcx $t1, $acc3
adox %rax, $acc3
################################# accumulate upper half
add $acc0, $acc4 # add $acc4, $acc0
adc $acc5, $acc1
mov $acc4, %rdx
adc $acc6, $acc2
adc $acc7, $acc3
mov $acc1, $acc6
adc \$0, %rax
################################# compare to modulus
# Branch-less conditional subtraction; result stays in registers for
# the next iteration of .Loop_ord_sqrx.
sub 8*0($a_ptr), $acc4
mov $acc2, $acc7
sbb 8*1($a_ptr), $acc1
sbb 8*2($a_ptr), $acc2
mov $acc3, $acc0
sbb 8*3($a_ptr), $acc3
sbb \$0, %rax
cmovnc $acc4, %rdx
cmovnc $acc1, $acc6
cmovnc $acc2, $acc7
cmovnc $acc3, $acc0
dec $b_ptr
jnz .Loop_ord_sqrx
mov %rdx, 8*0($r_ptr)
mov $acc6, 8*1($r_ptr)
pxor %xmm1, %xmm1
mov $acc7, 8*2($r_ptr)
pxor %xmm2, %xmm2
mov $acc0, 8*3($r_ptr)
pxor %xmm3, %xmm3
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
ret
.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
___
$code.=<<___;
################################################################################
# void ecp_nistz256_to_mont(
......
......@@ -48,6 +48,8 @@ static const ERR_STRING_DATA EC_str_functs[] = {
"ECPKParameters_print_fp"},
{ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_GET_AFFINE, 0),
"ecp_nistz256_get_affine"},
{ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_INV_MOD_ORD, 0),
"ecp_nistz256_inv_mod_ord"},
{ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, 0),
"ecp_nistz256_mult_precompute"},
{ERR_PACK(ERR_LIB_EC, EC_F_ECP_NISTZ256_POINTS_MUL, 0),
......
......@@ -155,6 +155,9 @@ struct ec_method_st {
/* custom ECDH operation */
int (*ecdh_compute_key)(unsigned char **pout, size_t *poutlen,
const EC_POINT *pub_key, const EC_KEY *ecdh);
/* Inverse modulo order */
int (*field_inverse_mod_ord)(const EC_GROUP *, BIGNUM *r, BIGNUM *x,
BN_CTX *ctx);
};
/*
......@@ -520,7 +523,6 @@ void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign,
unsigned char *digit, unsigned char in);
#endif
int ec_precompute_mont_data(EC_GROUP *);
int ec_group_simple_order_bits(const EC_GROUP *group);
#ifdef ECP_NISTZ256_ASM
......@@ -604,3 +606,6 @@ int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32],
const uint8_t peer_public_value[32]);
void X25519_public_from_private(uint8_t out_public_value[32],
const uint8_t private_key[32]);
int EC_GROUP_do_inverse_ord(const EC_GROUP *group, BIGNUM *res,
BIGNUM *x, BN_CTX *ctx);
......@@ -261,6 +261,8 @@ int EC_METHOD_get_field_type(const EC_METHOD *meth)
return meth->field_type;
}
static int ec_precompute_mont_data(EC_GROUP *);
int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator,
const BIGNUM *order, const BIGNUM *cofactor)
{
......@@ -961,7 +963,7 @@ int EC_GROUP_have_precompute_mult(const EC_GROUP *group)
* ec_precompute_mont_data sets |group->mont_data| from |group->order| and
* returns one on success. On error it returns zero.
*/
int ec_precompute_mont_data(EC_GROUP *group)
static int ec_precompute_mont_data(EC_GROUP *group)
{
BN_CTX *ctx = BN_CTX_new();
int ret = 0;
......@@ -1006,3 +1008,12 @@ int ec_group_simple_order_bits(const EC_GROUP *group)
return 0;
return BN_num_bits(group->order);
}
/*
 * Compute res = x^-1 modulo the group order, using the EC_METHOD's
 * dedicated modular-inverse implementation when one is provided.
 *
 * Returns 1 on success.  Returns 0 when the method supplies no
 * field_inverse_mod_ord hook (callers fall back to a generic inverse)
 * or when the hook itself fails.
 */
int EC_GROUP_do_inverse_ord(const EC_GROUP *group, BIGNUM *res,
                            BIGNUM *x, BN_CTX *ctx)
{
    if (group->meth->field_inverse_mod_ord == NULL)
        return 0;

    return group->meth->field_inverse_mod_ord(group, res, x, ctx);
}
......@@ -153,11 +153,13 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in,
}
while (BN_is_zero(r));
/* Check if optimized inverse is implemented */
if (EC_GROUP_do_inverse_ord(group, k, k, ctx) == 0) {
/* compute the inverse of k */
if (EC_GROUP_get_mont_data(group) != NULL) {
if (group->mont_data != NULL) {
/*
* We want inverse in constant time, therefore we utilize the fact
* order must be prime and use Fermat's Little Theorem instead.
* order must be prime and use Fermats Little Theorem instead.
*/
if (!BN_set_word(X, 2)) {
ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
......@@ -168,8 +170,8 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in,
goto err;
}
BN_set_flags(X, BN_FLG_CONSTTIME);
if (!BN_mod_exp_mont_consttime
(k, k, X, order, ctx, EC_GROUP_get_mont_data(group))) {
if (!BN_mod_exp_mont_consttime(k, k, X, order, ctx,
group->mont_data)) {
ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
goto err;
}
......@@ -179,6 +181,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in,
goto err;
}
}
}
/* clear old values if necessary */
BN_clear_free(*rp);
......@@ -407,10 +410,13 @@ int ossl_ecdsa_verify_sig(const unsigned char *dgst, int dgst_len,
goto err;
}
/* calculate tmp1 = inv(S) mod order */
/* Check if optimized inverse is implemented */
if (EC_GROUP_do_inverse_ord(group, u2, sig->s, ctx) == 0) {
if (!BN_mod_inverse(u2, sig->s, order, ctx)) {
ECerr(EC_F_OSSL_ECDSA_VERIFY_SIG, ERR_R_BN_LIB);
goto err;
}
}
/* digest -> m */
i = BN_num_bits(order);
/*
......
/*
* Copyright 2014-2017 The OpenSSL Project Authors. All Rights Reserved.
* Copyright (c) 2014, Intel Corporation. All Rights Reserved.
* Copyright (c) 2015, CloudFlare, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
* Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
* (1) Intel Corporation, Israel Development Center, Haifa, Israel
* (2) University of Haifa, Israel
* (3) CloudFlare, Inc.
*
* Reference:
* S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
......@@ -908,7 +910,7 @@ __owur static int ecp_nistz256_mult_precompute(EC_GROUP *group, BN_CTX *ctx)
*/
#if defined(ECP_NISTZ256_AVX2)
# if !(defined(__x86_64) || defined(__x86_64__) || \
defined(_M_AMD64) || defined(_MX64)) || \
defined(_M_AMD64) || defined(_M_X64)) || \
!(defined(__GNUC__) || defined(_MSC_VER)) /* this is for ALIGN32 */
# undef ECP_NISTZ256_AVX2
# else
......@@ -1495,6 +1497,117 @@ static int ecp_nistz256_window_have_precompute_mult(const EC_GROUP *group)
return HAVEPRECOMP(group, nistz256);
}
/*
 * _ARCH_PPC64 is the IBM XL / AIX predefined macro for 64-bit POWER
 * (the previous spelling "_ARCH_PP64" is not defined by any compiler,
 * which silently disabled this path under XL C).
 */
#if defined(__x86_64) || defined(__x86_64__) || \
    defined(_M_AMD64) || defined(_M_X64) || \
    defined(__powerpc64__) || defined(_ARCH_PPC64)
/*
 * Montgomery mul modulo Order(P): res = a*b*2^-256 mod Order(P)
 */
void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS],
                               const BN_ULONG a[P256_LIMBS],
                               const BN_ULONG b[P256_LIMBS]);
/*
 * Montgomery sqr modulo Order(P), |rep| times:
 * res = a^(2^rep) * 2^-256... mod Order(P)
 */
void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
                               const BN_ULONG a[P256_LIMBS],
                               int rep);

/*
 * Constant-time inverse modulo the group order: r = x^-1 mod ord(p256).
 * Since the order is prime, this computes x^(ord-2) (Fermat's Little
 * Theorem) with a fixed 4-bit-window addition chain, so the operation
 * sequence is independent of |x|.  Returns 1 on success, 0 on error.
 */
static int ecp_nistz256_inv_mod_ord(const EC_GROUP *group, BIGNUM *r,
                                    BIGNUM *x, BN_CTX *ctx)
{
    /* RR = 2^512 mod ord(p256) */
    static const BN_ULONG RR[P256_LIMBS]  = { TOBN(0x83244c95,0xbe79eea2),
                                              TOBN(0x4699799c,0x49bd6fa6),
                                              TOBN(0x2845b239,0x2b6bec59),
                                              TOBN(0x66e12d94,0xf3d95620) };
    /* The constant 1 (unlike ONE that is one in Montgomery representation) */
    static const BN_ULONG one[P256_LIMBS] = { TOBN(0,1),TOBN(0,0),
                                              TOBN(0,0),TOBN(0,0) };
    /*
     * expLo - the low 128bit of the exponent we use (ord(p256) - 2),
     * split into 4bit windows.  Note that none of the nibbles is zero,
     * which is what makes the "-1 offset" table addressing below safe.
     */
    static const unsigned char expLo[32]  = { 0xb,0xc,0xe,0x6,0xf,0xa,0xa,0xd,
                                              0xa,0x7,0x1,0x7,0x9,0xe,0x8,0x4,
                                              0xf,0x3,0xb,0x9,0xc,0xa,0xc,0x2,
                                              0xf,0xc,0x6,0x3,0x2,0x5,0x4,0xf };
    /*
     * We don't use entry 0 in the table, so we omit it and address
     * with -1 offset.
     */
    BN_ULONG table[15][P256_LIMBS];
    BN_ULONG out[P256_LIMBS], t[P256_LIMBS];
    int i, ret = 0;

    /*
     * Catch allocation failure early.
     */
    if (bn_wexpand(r, P256_LIMBS) == NULL) {
        ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB);
        goto err;
    }

    /* Reduce the input into [0, ord) if it is out of range. */
    if ((BN_num_bits(x) > 256) || BN_is_negative(x)) {
        BIGNUM *tmp;

        if ((tmp = BN_CTX_get(ctx)) == NULL
            || !BN_nnmod(tmp, x, group->order, ctx)) {
            ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB);
            goto err;
        }
        x = tmp;
    }

    if (!ecp_nistz256_bignum_to_field_elem(t, x)) {
        ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, EC_R_COORDINATES_OUT_OF_RANGE);
        goto err;
    }

    /* table[i-1] = x^i in the Montgomery domain, i = 1..15 */
    ecp_nistz256_ord_mul_mont(table[0], t, RR);
    for (i = 2; i < 16; i += 2) {
        /* even power by squaring, next odd power by multiplying with x */
        ecp_nistz256_ord_sqr_mont(table[i-1], table[i/2-1], 1);
        ecp_nistz256_ord_mul_mont(table[i], table[i-1], table[0]);
    }

    /*
     * The top 128bit of the exponent are highly redundant, so we
     * perform an optimized flow
     */
    ecp_nistz256_ord_sqr_mont(t, table[15-1], 4);   /* f0 */
    ecp_nistz256_ord_mul_mont(t, t, table[15-1]);   /* ff */

    ecp_nistz256_ord_sqr_mont(out, t, 8);           /* ff00 */
    ecp_nistz256_ord_mul_mont(out, out, t);         /* ffff */

    ecp_nistz256_ord_sqr_mont(t, out, 16);          /* ffff0000 */
    ecp_nistz256_ord_mul_mont(t, t, out);           /* ffffffff */

    ecp_nistz256_ord_sqr_mont(out, t, 64);          /* ffffffff0000000000000000 */
    ecp_nistz256_ord_mul_mont(out, out, t);         /* ffffffff00000000ffffffff */

    ecp_nistz256_ord_sqr_mont(out, out, 32);        /* ffffffff00000000ffffffff00000000 */
    ecp_nistz256_ord_mul_mont(out, out, t);         /* ffffffff00000000ffffffffffffffff */

    /*
     * The bottom 128 bit of the exponent are easier done with a table
     */
    for (i = 0; i < 32; i++) {
        ecp_nistz256_ord_sqr_mont(out, out, 4);
        /* The exponent is public, no need in constant-time access */
        ecp_nistz256_ord_mul_mont(out, out, table[expLo[i]-1]);
    }

    /* Multiplying by plain 1 converts out of the Montgomery domain. */
    ecp_nistz256_ord_mul_mont(out, out, one);

    /*
     * Can't fail, but check return code to be consistent anyway.
     */
    if (!bn_set_words(r, out, P256_LIMBS))
        goto err;

    ret = 1;
 err:
    return ret;
}
#else
# define ecp_nistz256_inv_mod_ord NULL
#endif
const EC_METHOD *EC_GFp_nistz256_method(void)
{
static const EC_METHOD ret = {
......@@ -1544,7 +1657,8 @@ const EC_METHOD *EC_GFp_nistz256_method(void)
ec_key_simple_generate_public_key,
0, /* keycopy */
0, /* keyfinish */
ecdh_simple_compute_key
ecdh_simple_compute_key,
ecp_nistz256_inv_mod_ord /* can be #defined-ed NULL */
};
return &ret;
......
......@@ -458,6 +458,7 @@ EC_F_ECPARAMETERS_PRINT_FP:148:ECParameters_print_fp
EC_F_ECPKPARAMETERS_PRINT:149:ECPKParameters_print
EC_F_ECPKPARAMETERS_PRINT_FP:150:ECPKParameters_print_fp
EC_F_ECP_NISTZ256_GET_AFFINE:240:ecp_nistz256_get_affine
EC_F_ECP_NISTZ256_INV_MOD_ORD:275:ecp_nistz256_inv_mod_ord
EC_F_ECP_NISTZ256_MULT_PRECOMPUTE:243:ecp_nistz256_mult_precompute
EC_F_ECP_NISTZ256_POINTS_MUL:241:ecp_nistz256_points_mul
EC_F_ECP_NISTZ256_PRE_COMP_NEW:244:ecp_nistz256_pre_comp_new
......
......@@ -50,6 +50,7 @@ int ERR_load_EC_strings(void);
# define EC_F_ECPKPARAMETERS_PRINT 149
# define EC_F_ECPKPARAMETERS_PRINT_FP 150
# define EC_F_ECP_NISTZ256_GET_AFFINE 240
# define EC_F_ECP_NISTZ256_INV_MOD_ORD 275
# define EC_F_ECP_NISTZ256_MULT_PRECOMPUTE 243
# define EC_F_ECP_NISTZ256_POINTS_MUL 241
# define EC_F_ECP_NISTZ256_PRE_COMP_NEW 244
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册