提交 592eef5c 编写于 作者: A Andy Polyakov

s390x assembly pack: add ChaCha20 and Poly1305 modules.

Reviewed-by: Tim Hudson <tjh@openssl.org>
上级 5e355e0c
......@@ -36,6 +36,8 @@ lib: $(LIBOBJ)
$(RANLIB) $(LIB) || echo Never mind.
@touch lib
chacha-%.S: asm/chacha-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
......
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2015
#
# ChaCha20 for s390x.
#
# 3 times faster than compiler-generated code.
# Command line: <flavour> [...] <output file>.
# A flavour containing "31" or "32" selects 4-byte SIZE_T and plain
# instruction mnemonics; anything else selects 8-byte SIZE_T and the
# "g"-suffixed mnemonics.
$flavour = shift;
if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# The first remaining argument that looks like a file name (name.ext)
# is taken as the output file; arguments preceding it are skipped.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the output file.  Without a file name the inherited
# STDOUT is kept, but a failed open is never ignored silently: that
# would leave the build with a missing or stale assembly file while the
# generated code goes to the terminal.
if ($output) {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}
# Catch-all for undefined subroutines, a [simplified] x86-style perlasm
# thunk.  A call such as &alr("%r0","%r4") appends "\talr\t%r0,%r4\n" to
# $code: the name of the called sub becomes the mnemonic and the
# arguments become the comma-separated operand list.
# No prototype is declared, because the sub consumes its argument list;
# the former empty prototype claimed the opposite and was only harmless
# because the &name(...) call form bypasses prototypes.
sub AUTOLOAD
{ my $opcode = $AUTOLOAD;
  $opcode =~ s/.*:://;			# strip the package qualifier
  $code .= "\t$opcode\t".join(',',@_)."\n";
}
# Stack pointer and frame geometry.  $stdframe is the offset at which
# the 16-word key schedule copy is kept; $frame additionally provides
# room for 4*20 bytes, i.e. those 16 words plus the 4-word area used to
# offload the 'c' words x[8]-x[11].
# NOTE(review): 16*SIZE_T+4*8 presumably is the standard s390x register
# save area -- confirm against the ABI.
my $sp = "%r15";
my $stdframe = 4*8 + 16*$SIZE_T;
my $frame = 4*20 + $stdframe;

# Argument registers, %r2 through %r6 in this order.
my ($out,$inp,$len,$key,$counter) = map { "%r$_" } 2..6;

# Register names for the sixteen state words.  x[8]-x[11] are kept in
# memory rather than in registers, hence the "%rx" placeholders at those
# positions; @t names the two registers holding the pair of them that
# is currently being worked on.
my @x = map { "%r$_" } (0..7, ("x") x 4, 10..13);
my @t = ("%r8", "%r9");
# ROUND(a0,b0,c0,d0) returns, as a list of strings meant to be eval-ed,
# the perlasm calls for one full round: four quarter-rounds processed as
# two interleaved pairs.  The arguments are the state indices used by
# the first quarter-round; the indices of the other three are obtained
# by stepping every index to the next position within its group of four,
# wrapping around at the end of the group.
sub ROUND {
my @q0 = @_;
my $step = sub { map { ($_&~3)+(($_+1)&3) } @_ };
my @q1 = $step->(@q0);
my @q2 = $step->(@q1);
my @q3 = $step->(@q2);
my ($c0,$c2) = ($q0[2],$q2[2]);

# Operands are emitted as quoted strings, because the result is eval-ed.
my ($c_lo,$c_hi) = map { sprintf('"%s"',$_) } @t;
my @reg = map { sprintf('"%s"',$_) } @x;

	# Order in which the variables are addressed by their index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# The 'a', 'b' and 'd' words live permanently in registers,
	# @x[0..7,12..15], whereas the 'c' words are kept in memory.
	# Within the 'c' column a pair of words stays the same between
	# rounds, so the working pair only has to be exchanged once per
	# round, in the middle.  Hence the store/load of 'c's between
	# the two halves and none at the beginning or end.

# Two quarter-rounds, given as references to their (a,b,c,d) index
# lists, interleaved instruction by instruction.  The first one works on
# the 'c' word held in $c_lo, the second one on the word held in $c_hi.
my $quarter_pair = sub {
	my ($pa,$pb,$pd) = map { $reg[$_] } @{$_[0]}[0,1,3];
	my ($qa,$qb,$qd) = map { $reg[$_] } @{$_[1]}[0,1,3];
	my @ops;

	# rotation counts applied to 'd' and to 'b' in each half
	foreach my $rot ([16,12],[8,7]) {
		my ($rot_d,$rot_b) = @$rot;
		push @ops,
			"&alr ($pa,$pb)",
			"&alr ($qa,$qb)",
			"&xr ($pd,$pa)",
			"&xr ($qd,$qa)",
			"&rll ($pd,$pd,$rot_d)",
			"&rll ($qd,$qd,$rot_d)",
			"&alr ($c_lo,$pd)",
			"&alr ($c_hi,$qd)",
			"&xr ($pb,$c_lo)",
			"&xr ($qb,$c_hi)",
			"&rll ($pb,$pb,$rot_b)",
			"&rll ($qb,$qb,$rot_b)";
	}
	return @ops;
};

return (
	$quarter_pair->(\@q0,\@q1),				# Q1 and Q2
	"&stm ($c_lo,$c_hi,'$stdframe+4*8+4*$c0($sp)')",	# reload pair of 'c's
	"&lm ($c_lo,$c_hi,'$stdframe+4*8+4*$c2($sp)')",
	$quarter_pair->(\@q2,\@q3)				# Q3 and Q4
);
}
# ChaCha20_ctr32 -- entry, prologue and head of the outer (per 64-byte
# block) loop.  The arguments are expected in $out, $inp, $len, $key and
# $counter.
#
# - A zero $len returns immediately.  Without this check the code below
#   would treat the request as a partial block and enter the byte-wise
#   tail loop with a zero count, which the branch-on-count instruction
#   does not terminate on.
# - $out is turned into the difference between output and input
#   pointers and $len into "end of input minus 64"; both are parked in
#   the frame.
# - A frame of $frame bytes is allocated and the sigma constant, the key
#   and the counter block are copied to $stdframe($sp) as the key
#   schedule.
# - .Loop_outer loads the working state from that copy and sets %r14 to
#   10, the iteration count of .Loop, whose body is emitted by the
#   statements that follow this one.
$code.=<<___;
.text
.globl ChaCha20_ctr32
.type ChaCha20_ctr32,\@function
.align 32
ChaCha20_ctr32:
lt${g}r $len,$len # $len==0?
bzr %r14
a${g}hi $len,-64
l${g}hi %r1,-$frame
stm${g} %r6,%r15,`6*$SIZE_T`($sp)
sl${g}r $out,$inp # difference
la $len,0($inp,$len) # end of input minus 64
larl %r7,.Lsigma
lgr %r0,$sp
la $sp,0(%r1,$sp)
st${g} %r0,0($sp)
lmg %r8,%r11,0($key) # load key
lmg %r12,%r13,0($counter) # load counter
lmg %r6,%r7,0(%r7) # load sigma constant
la %r14,0($inp)
st${g} $out,$frame+3*$SIZE_T($sp)
st${g} $len,$frame+4*$SIZE_T($sp)
stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack
srlg @x[12],%r12,32 # 32-bit counter value
j .Loop_outer
.align 16
.Loop_outer:
lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7]
lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11]
lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15]
stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11]
lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9]
st @x[12],$stdframe+4*12($sp) # save counter
st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer
lhi %r14,10
j .Loop
.align 4
.Loop:
___
# Body of .Loop: one round over the first set of indices followed by one
# over the second set.  Every string returned by ROUND is eval-ed, which
# appends the corresponding instruction to $code through AUTOLOAD.
foreach my $indices ([0, 4, 8,12], [0, 5,10,15]) {
	eval($_) foreach (&ROUND(@$indices));
}
# ChaCha20_ctr32 -- everything that follows the rounds: closing of
# .Loop, production of one 64-byte key-stream block, the full-block
# path, the byte-wise tail path, the epilogue and the sigma constant.
#
# After the rounds the key schedule kept at $stdframe($sp) is added to
# the state and every word is byte-reversed, so that storing it yields
# little-endian key-stream bytes.  @t[0] becomes the output pointer and
# @t[1] holds "end of input minus 64".
#
# Two defects of the earlier revision are corrected here:
#
# - Loop continuation.  The input pointer used to be advanced by 64
#   first and then compared with "end of input minus 64" using "lower or
#   equal".  Input that was longer than 64 bytes, but not a multiple of
#   64, therefore lost its last, partial block.  The pointer is now
#   compared before it is advanced and the loop continues on "lower",
#   i.e. whenever input remains; the partial block is then caught by the
#   .Ltail check at the top of the next iteration.
# - Tail key stream.  .Loop_tail reads 64 consecutive key-stream bytes
#   starting at $stdframe($sp), so words 8..11 belong at $stdframe+4*8.
#   They used to be stored to the 'c' offload slot, $stdframe+4*8+4*8,
#   which left the key copy in bytes 32..47 of what the tail XORs with
#   the input.
$code.=<<___;
brct %r14,.Loop
l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer
stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9]
lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp)
al @x[0],$stdframe+4*0($sp) # accumulate key schedule
al @x[1],$stdframe+4*1($sp)
al @x[2],$stdframe+4*2($sp)
al @x[3],$stdframe+4*3($sp)
al @x[4],$stdframe+4*4($sp)
al @x[5],$stdframe+4*5($sp)
al @x[6],$stdframe+4*6($sp)
al @x[7],$stdframe+4*7($sp)
lrvr @x[0],@x[0]
lrvr @x[1],@x[1]
lrvr @x[2],@x[2]
lrvr @x[3],@x[3]
lrvr @x[4],@x[4]
lrvr @x[5],@x[5]
lrvr @x[6],@x[6]
lrvr @x[7],@x[7]
al @x[12],$stdframe+4*12($sp)
al @x[13],$stdframe+4*13($sp)
al @x[14],$stdframe+4*14($sp)
al @x[15],$stdframe+4*15($sp)
lrvr @x[12],@x[12]
lrvr @x[13],@x[13]
lrvr @x[14],@x[14]
lrvr @x[15],@x[15]
la @t[0],0(@t[0],%r14) # reconstruct output pointer
cl${g}r %r14,@t[1]
jh .Ltail
x @x[0],4*0(%r14) # xor with input
x @x[1],4*1(%r14)
st @x[0],4*0(@t[0]) # store output
x @x[2],4*2(%r14)
st @x[1],4*1(@t[0])
x @x[3],4*3(%r14)
st @x[2],4*2(@t[0])
x @x[4],4*4(%r14)
st @x[3],4*3(@t[0])
lm @x[0],@x[3],$stdframe+4*8+4*8($sp) # load x[8]-x[11]
x @x[5],4*5(%r14)
st @x[4],4*4(@t[0])
x @x[6],4*6(%r14)
al @x[0],$stdframe+4*8($sp)
st @x[5],4*5(@t[0])
x @x[7],4*7(%r14)
al @x[1],$stdframe+4*9($sp)
st @x[6],4*6(@t[0])
x @x[12],4*12(%r14)
al @x[2],$stdframe+4*10($sp)
st @x[7],4*7(@t[0])
x @x[13],4*13(%r14)
al @x[3],$stdframe+4*11($sp)
st @x[12],4*12(@t[0])
x @x[14],4*14(%r14)
st @x[13],4*13(@t[0])
x @x[15],4*15(%r14)
st @x[14],4*14(@t[0])
lrvr @x[0],@x[0]
st @x[15],4*15(@t[0])
lrvr @x[1],@x[1]
lrvr @x[2],@x[2]
lrvr @x[3],@x[3]
lhi @x[12],1
x @x[0],4*8(%r14)
al @x[12],$stdframe+4*12($sp) # increment counter
x @x[1],4*9(%r14)
st @x[0],4*8(@t[0])
x @x[2],4*10(%r14)
st @x[1],4*9(@t[0])
x @x[3],4*11(%r14)
st @x[2],4*10(@t[0])
st @x[3],4*11(@t[0])
cl${g}r %r14,@t[1] # was that the last full block?
la %r14,64(%r14) # advance input pointer, condition code is preserved
jl .Loop_outer # input remains, possibly less than a block
.Ldone:
xgr %r0,%r0
xgr %r1,%r1
xgr %r2,%r2
xgr %r3,%r3
stmg %r0,%r3,$stdframe+4*4($sp) # wipe key copy
stmg %r0,%r3,$stdframe+4*12($sp)
lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
br %r14
.align 16
.Ltail:
la @t[1],64($t[1])
stm @x[0],@x[7],$stdframe+4*0($sp)
sl${g}r @t[1],%r14
lm @x[0],@x[3],$stdframe+4*8+4*8($sp)
l${g}hi @x[6],0
stm @x[12],@x[15],$stdframe+4*12($sp)
al @x[0],$stdframe+4*8($sp)
al @x[1],$stdframe+4*9($sp)
al @x[2],$stdframe+4*10($sp)
al @x[3],$stdframe+4*11($sp)
lrvr @x[0],@x[0]
lrvr @x[1],@x[1]
lrvr @x[2],@x[2]
lrvr @x[3],@x[3]
stm @x[0],@x[3],$stdframe+4*8($sp) # complete the 64-byte key stream
.Loop_tail:
llgc @x[4],0(@x[6],%r14)
llgc @x[5],$stdframe(@x[6],$sp)
xr @x[5],@x[4]
stc @x[5],0(@x[6],@t[0])
la @x[6],1(@x[6])
brct @t[1],.Loop_tail
j .Ldone
.size ChaCha20_ctr32,.-ChaCha20_ctr32
.align 32
.Lsigma:
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
.asciz "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# Post-process the generated code line by line: every `...` span is
# replaced by the value of the enclosed Perl expression, then the line
# is written to the output.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;
	print $_,"\n";
}

# The output is buffered, so a write error (a full disk, for instance)
# may surface only when the handle is closed.  Fail loudly instead of
# leaving a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";
......@@ -39,6 +39,8 @@ lib: $(LIBOBJ)
poly1305-sparcv9.S: asm/poly1305-sparcv9.pl
$(PERL) asm/poly1305-sparcv9.pl > $@
poly1305-%.S: asm/poly1305-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
......
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for s390x.
#
# June 2015
#
# ~6.4/2.2 cpb on z10/z196+, >2x improvement over compiler-generated
# code. For older compiler improvement coefficient is >3x, because
# then base 2^64 and base 2^32 implementations are compared.
#
# On side note, z13 enables vector base 2^26 implementation...
# Command line: <flavour> [...] <output file>.
# A flavour containing "31" or "32" selects 4-byte SIZE_T and plain
# instruction mnemonics; anything else selects 8-byte SIZE_T and the
# "g"-suffixed mnemonics.
$flavour = shift;
if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# The first remaining argument that looks like a file name (name.ext)
# is taken as the output file; arguments preceding it are skipped.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the output file.  Without a file name the inherited
# STDOUT is kept, but a failed open is never ignored silently: that
# would leave the build with a missing or stale assembly file while the
# generated code goes to the terminal.
if ($output) {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}

# Stack pointer and the registers carrying the first four arguments.
$sp="%r15";
my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));
# poly1305_init: $ctx points to the context, $inp to the key.
#
# The three 64-bit words of the hash value, at offsets 0, 8 and 16 of
# the context, are zeroed.  If $inp is zero nothing else is done.
# Otherwise the first 16 key bytes are loaded as two little-endian
# 64-bit values, masked with 0x0ffffffc0fffffff and 0x0ffffffc0ffffffc
# respectively, and stored at offsets 32 and 40 of the context.
# The function returns 0 in %r2 either way.
$code.=<<___;
.text
.globl poly1305_init
.type poly1305_init,\@function
.align 16
poly1305_init:
lghi %r0,0
lghi %r1,-1
stg %r0,0($ctx) # zero hash value
stg %r0,8($ctx)
stg %r0,16($ctx)
cl${g}r $inp,%r0
je .Lno_key
lrvg %r4,0($inp) # load little-endian key
lrvg %r5,8($inp)
nihl %r1,0xffc0 # 0xffffffc0ffffffff
srlg %r0,%r1,4 # 0x0ffffffc0fffffff
srlg %r1,%r1,4
nill %r1,0xfffc # 0x0ffffffc0ffffffc
ngr %r4,%r0
ngr %r5,%r1
stg %r4,32($ctx)
stg %r5,40($ctx)
.Lno_key:
lghi %r2,0
br %r14
.size poly1305_init,.-poly1305_init
___
# poly1305_blocks: $ctx points to the context, $inp to the input, $len
# is the input length in bytes and $padbit the value added to the top
# word of the accumulator for every block.
#
# $len is divided by 16 to obtain the number of blocks; nothing is done
# if that is zero.  For every block the 16 input bytes are loaded as two
# little-endian 64-bit words and added to the hash value h0:h1:h2, the
# sum is multiplied by the key r0:r1 (s1 = r1 + r1>>2 is used for the
# part of the product that wraps around), and the result is partially
# reduced so that h2 keeps only a few bits.  The hash value is loaded
# from and stored back to offsets 0, 8 and 16 of the context, the key is
# read from offsets 32 and 40.
#
# Three defects of the earlier revision are corrected here:
#
# - Carry chain.  In the final reduction step the carry out of h1 was
#   never added to h2: h2 was masked after the last add-with-carry into
#   h1, and the masking destroys the condition code.  h2 is now masked
#   first and then takes part in the carry chain.
# - 31-bit flavour.  The shift was spelled with three operands for both
#   flavours, but only SRLG takes three; SRL shifts its first operand in
#   place and takes two.
# - 31-bit flavour.  The upper half of $padbit is cleared, because all
#   arithmetic below is performed on 64-bit registers.
{
my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
my ($r0,$r1,$s1) = map("%r$_",(0..2));	# $s1 aliases $ctx, hence the off-load

# bytes -> blocks, spelled according to the flavour
my $len2blocks = $g ? "srlg $len,$len,4" : "srl $len,4";

$code.=<<___;
.globl poly1305_blocks
.type poly1305_blocks,\@function
.align 16
poly1305_blocks:
$len2blocks
lghi %r0,0
cl${g}r $len,%r0
je .Lno_data
stm${g} %r6,%r14,`6*$SIZE_T`($sp)
llgfr $padbit,$padbit # clear upper half, needed with non-64-bit ABI
lg $r0,32($ctx) # load key
lg $r1,40($ctx)
lg $h0,0($ctx) # load hash value
lg $h1,8($ctx)
lg $h2,16($ctx)
st$g $ctx,`2*$SIZE_T`($sp) # off-load $ctx
srlg $s1,$r1,2
algr $s1,$r1 # s1 = r1 + r1>>2
j .Loop
.align 16
.Loop:
lrvg $d0lo,0($inp) # load little-endian input
lrvg $d1lo,8($inp)
la $inp,16($inp)
algr $d0lo,$h0 # accumulate input
alcgr $d1lo,$h1
lgr $h0,$d0lo
mlgr $d0hi,$r0 # h0*r0 -> $d0hi:$d0lo
lgr $h1,$d1lo
mlgr $d1hi,$s1 # h1*5*r1 -> $d1hi:$d1lo
mlgr $t0,$r1 # h0*r1 -> $t0:$h0
mlgr $t1,$r0 # h1*r0 -> $t1:$h1
alcgr $h2,$padbit
algr $d0lo,$d1lo
lgr $d1lo,$h2
alcgr $d0hi,$d1hi
lghi $d1hi,0
algr $h1,$h0
alcgr $t1,$t0
msgr $d1lo,$s1 # h2*s1
msgr $h2,$r0 # h2*r0
algr $h1,$d1lo
alcgr $t1,$d1hi # $d1hi is zero
algr $h1,$d0hi
alcgr $h2,$t1
lghi $h0,-4 # final reduction step
ngr $h0,$h2
srlg $t0,$h2,2
algr $h0,$t0
lghi $t1,3
ngr $h2,$t1
algr $h0,$d0lo
alcgr $h1,$d1hi # $d1hi is still zero
alcgr $h2,$d1hi # $d1hi is still zero
brct$g $len,.Loop
l$g $ctx,`2*$SIZE_T`($sp) # restore $ctx
stg $h0,0($ctx) # store hash value
stg $h1,8($ctx)
stg $h2,16($ctx)
lm${g} %r6,%r14,`6*$SIZE_T`($sp)
.Lno_data:
br %r14
.size poly1305_blocks,.-poly1305_blocks
___
}
# poly1305_emit: $ctx points to the context, $mac (the register also
# known as $inp) to the 16-byte output and $nonce (the register also
# known as $len) to the nonce.
#
# The hash value h0:h1:h2 is loaded from offsets 0, 8 and 16 of the
# context.  5 is added to a copy of it and the bits of h2 above the low
# two tell whether that sum reached the modulus; a mask derived from
# them selects, without branching, either the sum or the original value
# as the low 128 bits.  The nonce is then added, with the 32-bit halves
# of each of its two 64-bit words swapped after loading, and the result
# is written out as 16 little-endian bytes.
# $ctx is reused as a scratch register for the second nonce word once
# the hash value has been loaded; %r6-%r9 are saved on entry and
# restored on exit.
{
my ($mac,$nonce)=($inp,$len);
my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9));
$code.=<<___;
.globl poly1305_emit
.type poly1305_emit,\@function
.align 16
poly1305_emit:
stm${g} %r6,%r9,`6*$SIZE_T`($sp)
lg $h0,0($ctx)
lg $h1,8($ctx)
lg $h2,16($ctx)
lghi %r0,5
lghi %r1,0
lgr $d0,$h0
lgr $d1,$h1
algr $h0,%r0 # compare to modulus
alcgr $h1,%r1
alcgr $h2,%r1
srlg $h2,$h2,2 # did it borrow/carry?
slgr %r1,$h2 # 0-$h2>>2
lg $h2,0($nonce) # load nonce
lghi %r0,-1
lg $ctx,8($nonce)
xgr %r0,%r1 # ~%r1
ngr $h0,%r1
ngr $d0,%r0
ngr $h1,%r1
ngr $d1,%r0
ogr $h0,$d0
rllg $d0,$h2,32 # flip nonce words
ogr $h1,$d1
rllg $d1,$ctx,32
algr $h0,$d0 # accumulate nonce
alcgr $h1,$d1
strvg $h0,0($mac) # write little-endian result
strvg $h1,8($mac)
lm${g} %r6,%r9,`6*$SIZE_T`($sp)
br %r14
.size poly1305_emit,.-poly1305_emit
.string "Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
}
# Replace every `...` span in the generated code by the value of the
# enclosed Perl expression and write the result out.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;

# The output is buffered, so a write error (a full disk, for instance)
# may surface only when the handle is closed.  Fail loudly instead of
# leaving a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册