Commit 1fdcef75 authored by Andy Polyakov

ARM assembly pack: add ChaCha20 and Poly1305 modules.

Reviewed-by: Richard Levitte <levitte@openssl.org>
Parent 6d9843e7
@@ -43,6 +43,9 @@ chacha-x86_64.s: asm/chacha-x86_64.pl
chacha-%.S: asm/chacha-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
chacha-armv4.o: chacha-armv4.S
chacha-armv8.o: chacha-armv8.S
files:
	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of large buffer.
#
# IALU/gcc-4.4 1xNEON 3xNEON+1xIALU
#
# Cortex-A5 19.3(*)/+95% 21.8 14.1
# Cortex-A8 10.5(*)/+160% 13.9 6.35
# Cortex-A9 12.9(**)/+110% 14.3 6.50
# Cortex-A15 11.0/+40% 16.0 5.00
# Snapdragon S4 11.5/+125% 13.6 4.90
#
# (*) most "favourable" result, for aligned data on a little-endian
# processor; the result for misaligned data is 10-15% lower;
# (**) this result is a trade-off: it can be improved by 20%,
# but then the Snapdragon S4 and Cortex-A8 results get
# 20-25% worse;
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
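# Illustration: a call like &vshr_u32($b,$t,20) has no real subroutine
# behind it, so Perl dispatches it to AUTOLOAD above, which maps the
# underscore-mangled name back to the dotted mnemonic and appends
# "\tvshr.u32\t<b>,<t>,#20\n" to $code; the '#' immediate prefix is
# added only when the last argument is numeric, so string operands
# such as 'ror#16' pass through untouched.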
my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_) = (@t[0..1]);
my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;
# Consider the order in which the variables are addressed by
# their index:
#
# a b c d
#
# 0 4 8 12 < even round
# 1 5 9 13
# 2 6 10 14
# 3 7 11 15
# 0 5 10 15 < odd round
# 1 6 11 12
# 2 7 8 13
# 3 4 9 14
#
# 'a' and 'b' are permanently allocated in registers, @x[0..7],
# while the 'c's and a pair of 'd's are maintained in memory. If
# you observe the 'c' column, you'll notice that a pair of 'c's is
# invariant between rounds, which means they have to be reloaded
# once per round, in the middle. This is why you'll see a bunch
# of 'c' stores and loads in the middle, but none at the beginning
# or end. If you observe the 'd' column, you'll notice that 15 and
# 13 are reused in the next pair of rounds. This is why these two
# are chosen for offloading to memory, to make the loads count more.
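# As an illustration of the index map above: the even-round call
# ROUND(0,4,8,12) derives (1,5,9,13), (2,6,10,14) and (3,7,11,15)
# via ($_&~3)+(($_+1)&3), i.e. one call covers all four columns;
# the odd-round call ROUND(0,5,10,15) likewise walks the four
# diagonals (1,6,11,12), (2,7,8,13) and (3,4,9,14).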
push @ret,(
"&add (@x[$a0],@x[$a0],@x[$b0])",
"&mov ($xd,$xd,'ror#16')",
"&add (@x[$a1],@x[$a1],@x[$b1])",
"&mov ($xd_,$xd_,'ror#16')",
"&eor ($xd,$xd,@x[$a0],'ror#16')",
"&eor ($xd_,$xd_,@x[$a1],'ror#16')",
"&add ($xc,$xc,$xd)",
"&mov (@x[$b0],@x[$b0],'ror#20')",
"&add ($xc_,$xc_,$xd_)",
"&mov (@x[$b1],@x[$b1],'ror#20')",
"&eor (@x[$b0],@x[$b0],$xc,'ror#20')",
"&eor (@x[$b1],@x[$b1],$xc_,'ror#20')",
"&add (@x[$a0],@x[$a0],@x[$b0])",
"&mov ($xd,$xd,'ror#24')",
"&add (@x[$a1],@x[$a1],@x[$b1])",
"&mov ($xd_,$xd_,'ror#24')",
"&eor ($xd,$xd,@x[$a0],'ror#24')",
"&eor ($xd_,$xd_,@x[$a1],'ror#24')",
"&add ($xc,$xc,$xd)",
"&mov (@x[$b0],@x[$b0],'ror#25')" );
push @ret,(
"&str ($xd,'[sp,#4*(16+$d0)]')",
"&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd);
push @ret,(
"&add ($xc_,$xc_,$xd_)",
"&mov (@x[$b1],@x[$b1],'ror#25')" );
push @ret,(
"&str ($xd_,'[sp,#4*(16+$d1)]')",
"&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd);
push @ret,(
"&eor (@x[$b0],@x[$b0],$xc,'ror#25')",
"&eor (@x[$b1],@x[$b1],$xc_,'ror#25')" );
$xd=@x[$d2] if (!$odd);
$xd_=@x[$d3] if ($odd);
push @ret,(
"&str ($xc,'[sp,#4*(16+$c0)]')",
"&ldr ($xc,'[sp,#4*(16+$c2)]')",
"&add (@x[$a2],@x[$a2],@x[$b2])",
"&mov ($xd,$xd,'ror#16')",
"&str ($xc_,'[sp,#4*(16+$c1)]')",
"&ldr ($xc_,'[sp,#4*(16+$c3)]')",
"&add (@x[$a3],@x[$a3],@x[$b3])",
"&mov ($xd_,$xd_,'ror#16')",
"&eor ($xd,$xd,@x[$a2],'ror#16')",
"&eor ($xd_,$xd_,@x[$a3],'ror#16')",
"&add ($xc,$xc,$xd)",
"&mov (@x[$b2],@x[$b2],'ror#20')",
"&add ($xc_,$xc_,$xd_)",
"&mov (@x[$b3],@x[$b3],'ror#20')",
"&eor (@x[$b2],@x[$b2],$xc,'ror#20')",
"&eor (@x[$b3],@x[$b3],$xc_,'ror#20')",
"&add (@x[$a2],@x[$a2],@x[$b2])",
"&mov ($xd,$xd,'ror#24')",
"&add (@x[$a3],@x[$a3],@x[$b3])",
"&mov ($xd_,$xd_,'ror#24')",
"&eor ($xd,$xd,@x[$a2],'ror#24')",
"&eor ($xd_,$xd_,@x[$a3],'ror#24')",
"&add ($xc,$xc,$xd)",
"&mov (@x[$b2],@x[$b2],'ror#25')",
"&add ($xc_,$xc_,$xd_)",
"&mov (@x[$b3],@x[$b3],'ror#25')",
"&eor (@x[$b2],@x[$b2],$xc,'ror#25')",
"&eor (@x[$b3],@x[$b3],$xc_,'ror#25')" );
@ret;
}
$code.=<<___;
#include "arm_arch.h"
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb ldrbhs
#endif
.align 5
.Lsigma:
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
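@ i.e. the ASCII string "expand 32-byte k" split into little-endian words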
.Lone:
.long 1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word -1
#endif
.globl ChaCha20_ctr32
.type ChaCha20_ctr32,%function
.align 5
ChaCha20_ctr32:
.LChaCha20_ctr32:
ldr r12,[sp,#0] @ pull pointer to counter and nonce
stmdb sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
sub r14,pc,#16 @ ChaCha20_ctr32
#else
adr r14,.LChaCha20_ctr32
#endif
#if __ARM_MAX_ARCH__>=7
cmp r2,#192 @ test len
bls .Lshort
ldr r4,[r14,#-32]
ldr r4,[r14,r4]
# ifdef __APPLE__
ldr r4,[r4]
# endif
tst r4,#1
bne .LChaCha20_neon
.Lshort:
#endif
ldmia r12,{r4-r7} @ load counter and nonce
sub sp,sp,#4*(16) @ off-load area
sub r14,r14,#64 @ .Lsigma
stmdb sp!,{r4-r7} @ copy counter and nonce
ldmia r3,{r4-r11} @ load key
ldmia r14,{r0-r3} @ load sigma
stmdb sp!,{r4-r11} @ copy key
stmdb sp!,{r0-r3} @ copy sigma
str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
b .Loop_outer_enter
.align 4
.Loop_outer:
ldmia sp,{r0-r9} @ load key material
str @t[3],[sp,#4*(32+2)] @ save len
str r12, [sp,#4*(32+1)] @ save inp
str r14, [sp,#4*(32+0)] @ save out
.Loop_outer_enter:
ldr @t[3], [sp,#4*(15)]
ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
ldr @t[2], [sp,#4*(13)]
ldr @x[14],[sp,#4*(14)]
str @t[3], [sp,#4*(16+15)]
mov @t[3],#10
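@ 10 iterations of the double round below = ChaCha20's 20 rounds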
b .Loop
.align 4
.Loop:
subs @t[3],@t[3],#1
___
foreach (&ROUND(0, 4, 8,12)) { eval; }
foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
bne .Loop
ldr @t[3],[sp,#4*(32+2)] @ load len
str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
str @t[1], [sp,#4*(16+9)]
str @x[12],[sp,#4*(16+12)]
str @t[2], [sp,#4*(16+13)]
str @x[14],[sp,#4*(16+14)]
@ at this point we have first half of 512-bit result in
@ @x[0-7] and second half at sp+4*(16+8)
cmp @t[3],#64 @ done yet?
#ifdef __thumb2__
itete lo
#endif
addlo r12,sp,#4*(0) @ shortcut or ...
ldrhs r12,[sp,#4*(32+1)] @ ... load inp
addlo r14,sp,#4*(0) @ shortcut or ...
ldrhs r14,[sp,#4*(32+0)] @ ... load out
ldr @t[0],[sp,#4*(0)] @ load key material
ldr @t[1],[sp,#4*(1)]
#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
orr @t[2],r12,r14
tst @t[2],#3 @ are input and output aligned?
ldr @t[2],[sp,#4*(2)]
bne .Lunaligned
cmp @t[3],#64 @ restore flags
# else
ldr @t[2],[sp,#4*(2)]
# endif
ldr @t[3],[sp,#4*(3)]
add @x[0],@x[0],@t[0] @ accumulate key material
add @x[1],@x[1],@t[1]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[0],[r12],#16 @ load input
ldrhs @t[1],[r12,#-12]
add @x[2],@x[2],@t[2]
add @x[3],@x[3],@t[3]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[2],[r12,#-8]
ldrhs @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
rev @x[0],@x[0]
rev @x[1],@x[1]
rev @x[2],@x[2]
rev @x[3],@x[3]
# endif
# ifdef __thumb2__
itt hs
# endif
eorhs @x[0],@x[0],@t[0] @ xor with input
eorhs @x[1],@x[1],@t[1]
add @t[0],sp,#4*(4)
str @x[0],[r14],#16 @ store output
# ifdef __thumb2__
itt hs
# endif
eorhs @x[2],@x[2],@t[2]
eorhs @x[3],@x[3],@t[3]
ldmia @t[0],{@t[0]-@t[3]} @ load key material
str @x[1],[r14,#-12]
str @x[2],[r14,#-8]
str @x[3],[r14,#-4]
add @x[4],@x[4],@t[0] @ accumulate key material
add @x[5],@x[5],@t[1]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[0],[r12],#16 @ load input
ldrhs @t[1],[r12,#-12]
add @x[6],@x[6],@t[2]
add @x[7],@x[7],@t[3]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[2],[r12,#-8]
ldrhs @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
rev @x[4],@x[4]
rev @x[5],@x[5]
rev @x[6],@x[6]
rev @x[7],@x[7]
# endif
# ifdef __thumb2__
itt hs
# endif
eorhs @x[4],@x[4],@t[0]
eorhs @x[5],@x[5],@t[1]
add @t[0],sp,#4*(8)
str @x[4],[r14],#16 @ store output
# ifdef __thumb2__
itt hs
# endif
eorhs @x[6],@x[6],@t[2]
eorhs @x[7],@x[7],@t[3]
str @x[5],[r14,#-12]
ldmia @t[0],{@t[0]-@t[3]} @ load key material
str @x[6],[r14,#-8]
add @x[0],sp,#4*(16+8)
str @x[7],[r14,#-4]
ldmia @x[0],{@x[0]-@x[7]} @ load second half
add @x[0],@x[0],@t[0] @ accumulate key material
add @x[1],@x[1],@t[1]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[0],[r12],#16 @ load input
ldrhs @t[1],[r12,#-12]
# ifdef __thumb2__
itt hi
# endif
strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
add @x[2],@x[2],@t[2]
add @x[3],@x[3],@t[3]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[2],[r12,#-8]
ldrhs @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
rev @x[0],@x[0]
rev @x[1],@x[1]
rev @x[2],@x[2]
rev @x[3],@x[3]
# endif
# ifdef __thumb2__
itt hs
# endif
eorhs @x[0],@x[0],@t[0]
eorhs @x[1],@x[1],@t[1]
add @t[0],sp,#4*(12)
str @x[0],[r14],#16 @ store output
# ifdef __thumb2__
itt hs
# endif
eorhs @x[2],@x[2],@t[2]
eorhs @x[3],@x[3],@t[3]
str @x[1],[r14,#-12]
ldmia @t[0],{@t[0]-@t[3]} @ load key material
str @x[2],[r14,#-8]
str @x[3],[r14,#-4]
add @x[4],@x[4],@t[0] @ accumulate key material
add @x[5],@x[5],@t[1]
# ifdef __thumb2__
itt hi
# endif
addhi @t[0],@t[0],#1 @ next counter value
strhi @t[0],[sp,#4*(12)] @ save next counter value
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[0],[r12],#16 @ load input
ldrhs @t[1],[r12,#-12]
add @x[6],@x[6],@t[2]
add @x[7],@x[7],@t[3]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[2],[r12,#-8]
ldrhs @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
rev @x[4],@x[4]
rev @x[5],@x[5]
rev @x[6],@x[6]
rev @x[7],@x[7]
# endif
# ifdef __thumb2__
itt hs
# endif
eorhs @x[4],@x[4],@t[0]
eorhs @x[5],@x[5],@t[1]
# ifdef __thumb2__
it hi
# endif
ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
# ifdef __thumb2__
itt hs
# endif
eorhs @x[6],@x[6],@t[2]
eorhs @x[7],@x[7],@t[3]
str @x[4],[r14],#16 @ store output
str @x[5],[r14,#-12]
# ifdef __thumb2__
it hs
# endif
subhs @t[3],@t[0],#64 @ len-=64
str @x[6],[r14,#-8]
str @x[7],[r14,#-4]
bhi .Loop_outer
beq .Ldone
# if __ARM_ARCH__<7
b .Ltail
.align 4
.Lunaligned: @ unaligned endian-neutral path
cmp @t[3],#64 @ restore flags
# endif
#endif
#if __ARM_ARCH__<7
ldr @t[3],[sp,#4*(3)]
___
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;
$code.=<<___ if ($i==4);
add @x[0],sp,#4*(16+8)
___
$code.=<<___ if ($i==8);
ldmia @x[0],{@x[0]-@x[7]} @ load second half
# ifdef __thumb2__
itt hi
# endif
strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]"
strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]"
___
$code.=<<___;
add @x[$j+0],@x[$j+0],@t[0] @ accumulate key material
___
$code.=<<___ if ($i==12);
# ifdef __thumb2__
itt hi
# endif
addhi @t[0],@t[0],#1 @ next counter value
strhi @t[0],[sp,#4*(12)] @ save next counter value
___
$code.=<<___;
add @x[$j+1],@x[$j+1],@t[1]
add @x[$j+2],@x[$j+2],@t[2]
# ifdef __thumb2__
itete lo
# endif
eorlo @t[0],@t[0],@t[0] @ zero or ...
ldrhsb @t[0],[r12],#16 @ ... load input
eorlo @t[1],@t[1],@t[1]
ldrhsb @t[1],[r12,#-12]
add @x[$j+3],@x[$j+3],@t[3]
# ifdef __thumb2__
itete lo
# endif
eorlo @t[2],@t[2],@t[2]
ldrhsb @t[2],[r12,#-8]
eorlo @t[3],@t[3],@t[3]
ldrhsb @t[3],[r12,#-4]
eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero)
eor @x[$j+1],@t[1],@x[$j+1]
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[0],[r12,#-15] @ load more input
ldrhsb @t[1],[r12,#-11]
eor @x[$j+2],@t[2],@x[$j+2]
strb @x[$j+0],[r14],#16 @ store output
eor @x[$j+3],@t[3],@x[$j+3]
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[2],[r12,#-7]
ldrhsb @t[3],[r12,#-3]
strb @x[$j+1],[r14,#-12]
eor @x[$j+0],@t[0],@x[$j+0],lsr#8
strb @x[$j+2],[r14,#-8]
eor @x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[0],[r12,#-14] @ load more input
ldrhsb @t[1],[r12,#-10]
strb @x[$j+3],[r14,#-4]
eor @x[$j+2],@t[2],@x[$j+2],lsr#8
strb @x[$j+0],[r14,#-15]
eor @x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[2],[r12,#-6]
ldrhsb @t[3],[r12,#-2]
strb @x[$j+1],[r14,#-11]
eor @x[$j+0],@t[0],@x[$j+0],lsr#8
strb @x[$j+2],[r14,#-7]
eor @x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[0],[r12,#-13] @ load more input
ldrhsb @t[1],[r12,#-9]
strb @x[$j+3],[r14,#-3]
eor @x[$j+2],@t[2],@x[$j+2],lsr#8
strb @x[$j+0],[r14,#-14]
eor @x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[2],[r12,#-5]
ldrhsb @t[3],[r12,#-1]
strb @x[$j+1],[r14,#-10]
strb @x[$j+2],[r14,#-6]
eor @x[$j+0],@t[0],@x[$j+0],lsr#8
strb @x[$j+3],[r14,#-2]
eor @x[$j+1],@t[1],@x[$j+1],lsr#8
strb @x[$j+0],[r14,#-13]
eor @x[$j+2],@t[2],@x[$j+2],lsr#8
strb @x[$j+1],[r14,#-9]
eor @x[$j+3],@t[3],@x[$j+3],lsr#8
strb @x[$j+2],[r14,#-5]
strb @x[$j+3],[r14,#-1]
___
$code.=<<___ if ($i<12);
add @t[0],sp,#4*(4+$i)
ldmia @t[0],{@t[0]-@t[3]} @ load key material
___
}
$code.=<<___;
# ifdef __thumb2__
it hi
# endif
ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
# ifdef __thumb2__
it hs
# endif
subhs @t[3],@t[0],#64 @ len-=64
bhi .Loop_outer
beq .Ldone
#endif
.Ltail:
ldr r12,[sp,#4*(32+1)] @ load inp
add @t[2],sp,#4*(0)
ldr r14,[sp,#4*(32+0)] @ load out
.Loop_tail:
ldrb @t[0],[@t[2]],#1 @ read buffer on stack
ldrb @t[1],[r12],#1 @ read input
subs @t[3],@t[3],#1
eor @t[0],@t[0],@t[1]
strb @t[0],[r14],#1 @ store output
bne .Loop_tail
.Ldone:
add sp,sp,#4*(32+3)
ldmia sp!,{r4-r11,pc}
.size ChaCha20_ctr32,.-ChaCha20_ctr32
___
{{{
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
map("q$_",(0..15));
sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;
(
"&vadd_i32 ($a,$a,$b)",
"&veor ($d,$d,$a)",
"&vrev32_16 ($d,$d)", # vrot ($d,16)
"&vadd_i32 ($c,$c,$d)",
"&veor ($t,$b,$c)",
"&vshr_u32 ($b,$t,20)",
"&vsli_32 ($b,$t,12)",
"&vadd_i32 ($a,$a,$b)",
"&veor ($t,$d,$a)",
"&vshr_u32 ($d,$t,24)",
"&vsli_32 ($d,$t,8)",
"&vadd_i32 ($c,$c,$d)",
"&veor ($t,$b,$c)",
"&vshr_u32 ($b,$t,25)",
"&vsli_32 ($b,$t,7)",
"&vext_8 ($c,$c,$c,8)",
"&vext_8 ($b,$b,$b,$odd?12:4)",
"&vext_8 ($d,$d,$d,$odd?4:12)"
);
}
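# NEON lacks a vector rotate, so each rotate-left by n is built from
# the xor result kept in $t: vshr.u32 writes t>>(32-n) and vsli.32
# (shift-left-and-insert) merges t<<n on top, e.g. vshr_u32(b,t,20)
# followed by vsli_32(b,t,12) yields the rotate by 12.  The final
# three vext.8 instructions rotate the b, c and d rows across lanes,
# so that even calls set up the diagonal round and odd calls restore
# the column layout.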
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.type ChaCha20_neon,%function
.align 5
ChaCha20_neon:
ldr r12,[sp,#0] @ pull pointer to counter and nonce
stmdb sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
adr r14,.Lsigma
vstmdb sp!,{d8-d15} @ ABI spec says so
stmdb sp!,{r0-r3}
vld1.32 {$b0-$c0},[r3] @ load key
ldmia r3,{r4-r11} @ load key
sub sp,sp,#4*(16+16)
vld1.32 {$d0},[r12] @ load counter and nonce
add r12,sp,#4*8
ldmia r14,{r0-r3} @ load sigma
vld1.32 {$a0},[r14]! @ load sigma
vld1.32 {$t0},[r14] @ one
vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce
vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key
str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
vshl.i32 $t1#lo,$t0#lo,#1 @ two
vstr $t0#lo,[sp,#4*(16+0)]
vshl.i32 $t2#lo,$t0#lo,#2 @ four
vstr $t1#lo,[sp,#4*(16+2)]
vmov $a1,$a0
vstr $t2#lo,[sp,#4*(16+4)]
vmov $a2,$a0
vmov $b1,$b0
vmov $b2,$b0
b .Loop_neon_enter
.align 4
.Loop_neon_outer:
ldmia sp,{r0-r9} @ load key material
cmp @t[3],#64*2 @ if len<=64*2
bls .Lbreak_neon @ switch to integer-only
vmov $a1,$a0
str @t[3],[sp,#4*(32+2)] @ save len
vmov $a2,$a0
str r12, [sp,#4*(32+1)] @ save inp
vmov $b1,$b0
str r14, [sp,#4*(32+0)] @ save out
vmov $b2,$b0
.Loop_neon_enter:
ldr @t[3], [sp,#4*(15)]
vadd.i32 $d1,$d0,$t0 @ counter+1
ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
vmov $c1,$c0
ldr @t[2], [sp,#4*(13)]
vmov $c2,$c0
ldr @x[14],[sp,#4*(14)]
vadd.i32 $d2,$d1,$t0 @ counter+2
str @t[3], [sp,#4*(16+15)]
mov @t[3],#10
add @x[12],@x[12],#3 @ counter+3
b .Loop_neon
.align 4
.Loop_neon:
subs @t[3],@t[3],#1
___
my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
my @thread3=&ROUND(0,4,8,12);
foreach (@thread0) {
eval; eval(shift(@thread3));
eval(shift(@thread1)); eval(shift(@thread3));
eval(shift(@thread2)); eval(shift(@thread3));
}
@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
@thread3=&ROUND(0,5,10,15);
foreach (@thread0) {
eval; eval(shift(@thread3));
eval(shift(@thread1)); eval(shift(@thread3));
eval(shift(@thread2)); eval(shift(@thread3));
}
$code.=<<___;
bne .Loop_neon
add @t[3],sp,#32
vld1.32 {$t0-$t1},[sp] @ load key material
vld1.32 {$t2-$t3},[@t[3]]
ldr @t[3],[sp,#4*(32+2)] @ load len
str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
str @t[1], [sp,#4*(16+9)]
str @x[12],[sp,#4*(16+12)]
str @t[2], [sp,#4*(16+13)]
str @x[14],[sp,#4*(16+14)]
@ at this point we have first half of 512-bit result in
@ @x[0-7] and second half at sp+4*(16+8)
ldr r12,[sp,#4*(32+1)] @ load inp
ldr r14,[sp,#4*(32+0)] @ load out
vadd.i32 $a0,$a0,$t0 @ accumulate key material
vadd.i32 $a1,$a1,$t0
vadd.i32 $a2,$a2,$t0
vldr $t0#lo,[sp,#4*(16+0)] @ one
vadd.i32 $b0,$b0,$t1
vadd.i32 $b1,$b1,$t1
vadd.i32 $b2,$b2,$t1
vldr $t1#lo,[sp,#4*(16+2)] @ two
vadd.i32 $c0,$c0,$t2
vadd.i32 $c1,$c1,$t2
vadd.i32 $c2,$c2,$t2
vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1
vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2
vadd.i32 $d0,$d0,$t3
vadd.i32 $d1,$d1,$t3
vadd.i32 $d2,$d2,$t3
cmp @t[3],#64*4
blo .Ltail_neon
vld1.8 {$t0-$t1},[r12]! @ load input
mov @t[3],sp
vld1.8 {$t2-$t3},[r12]!
veor $a0,$a0,$t0 @ xor with input
veor $b0,$b0,$t1
vld1.8 {$t0-$t1},[r12]!
veor $c0,$c0,$t2
veor $d0,$d0,$t3
vld1.8 {$t2-$t3},[r12]!
veor $a1,$a1,$t0
vst1.8 {$a0-$b0},[r14]! @ store output
veor $b1,$b1,$t1
vld1.8 {$t0-$t1},[r12]!
veor $c1,$c1,$t2
vst1.8 {$c0-$d0},[r14]!
veor $d1,$d1,$t3
vld1.8 {$t2-$t3},[r12]!
veor $a2,$a2,$t0
vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration
veor $t0#hi,$t0#hi,$t0#hi
vldr $t0#lo,[sp,#4*(16+4)] @ four
veor $b2,$b2,$t1
vld1.32 {$c0-$d0},[@t[3]]
veor $c2,$c2,$t2
vst1.8 {$a1-$b1},[r14]!
veor $d2,$d2,$t3
vst1.8 {$c1-$d1},[r14]!
vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value
vldr $t0#lo,[sp,#4*(16+0)] @ one
ldmia sp,{@t[0]-@t[3]} @ load key material
add @x[0],@x[0],@t[0] @ accumulate key material
ldr @t[0],[r12],#16 @ load input
vst1.8 {$a2-$b2},[r14]!
add @x[1],@x[1],@t[1]
ldr @t[1],[r12,#-12]
vst1.8 {$c2-$d2},[r14]!
add @x[2],@x[2],@t[2]
ldr @t[2],[r12,#-8]
add @x[3],@x[3],@t[3]
ldr @t[3],[r12,#-4]
# ifdef __ARMEB__
rev @x[0],@x[0]
rev @x[1],@x[1]
rev @x[2],@x[2]
rev @x[3],@x[3]
# endif
eor @x[0],@x[0],@t[0] @ xor with input
add @t[0],sp,#4*(4)
eor @x[1],@x[1],@t[1]
str @x[0],[r14],#16 @ store output
eor @x[2],@x[2],@t[2]
str @x[1],[r14,#-12]
eor @x[3],@x[3],@t[3]
ldmia @t[0],{@t[0]-@t[3]} @ load key material
str @x[2],[r14,#-8]
str @x[3],[r14,#-4]
add @x[4],@x[4],@t[0] @ accumulate key material
ldr @t[0],[r12],#16 @ load input
add @x[5],@x[5],@t[1]
ldr @t[1],[r12,#-12]
add @x[6],@x[6],@t[2]
ldr @t[2],[r12,#-8]
add @x[7],@x[7],@t[3]
ldr @t[3],[r12,#-4]
# ifdef __ARMEB__
rev @x[4],@x[4]
rev @x[5],@x[5]
rev @x[6],@x[6]
rev @x[7],@x[7]
# endif
eor @x[4],@x[4],@t[0]
add @t[0],sp,#4*(8)
eor @x[5],@x[5],@t[1]
str @x[4],[r14],#16 @ store output
eor @x[6],@x[6],@t[2]
str @x[5],[r14,#-12]
eor @x[7],@x[7],@t[3]
ldmia @t[0],{@t[0]-@t[3]} @ load key material
str @x[6],[r14,#-8]
add @x[0],sp,#4*(16+8)
str @x[7],[r14,#-4]
ldmia @x[0],{@x[0]-@x[7]} @ load second half
add @x[0],@x[0],@t[0] @ accumulate key material
ldr @t[0],[r12],#16 @ load input
add @x[1],@x[1],@t[1]
ldr @t[1],[r12,#-12]
# ifdef __thumb2__
it hi
# endif
strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
add @x[2],@x[2],@t[2]
ldr @t[2],[r12,#-8]
# ifdef __thumb2__
it hi
# endif
strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
add @x[3],@x[3],@t[3]
ldr @t[3],[r12,#-4]
# ifdef __ARMEB__
rev @x[0],@x[0]
rev @x[1],@x[1]
rev @x[2],@x[2]
rev @x[3],@x[3]
# endif
eor @x[0],@x[0],@t[0]
add @t[0],sp,#4*(12)
eor @x[1],@x[1],@t[1]
str @x[0],[r14],#16 @ store output
eor @x[2],@x[2],@t[2]
str @x[1],[r14,#-12]
eor @x[3],@x[3],@t[3]
ldmia @t[0],{@t[0]-@t[3]} @ load key material
str @x[2],[r14,#-8]
str @x[3],[r14,#-4]
add @x[4],@x[4],@t[0] @ accumulate key material
add @t[0],@t[0],#4 @ next counter value
add @x[5],@x[5],@t[1]
str @t[0],[sp,#4*(12)] @ save next counter value
ldr @t[0],[r12],#16 @ load input
add @x[6],@x[6],@t[2]
add @x[4],@x[4],#3 @ counter+3
ldr @t[1],[r12,#-12]
add @x[7],@x[7],@t[3]
ldr @t[2],[r12,#-8]
ldr @t[3],[r12,#-4]
# ifdef __ARMEB__
rev @x[4],@x[4]
rev @x[5],@x[5]
rev @x[6],@x[6]
rev @x[7],@x[7]
# endif
eor @x[4],@x[4],@t[0]
# ifdef __thumb2__
it hi
# endif
ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
eor @x[5],@x[5],@t[1]
eor @x[6],@x[6],@t[2]
str @x[4],[r14],#16 @ store output
eor @x[7],@x[7],@t[3]
str @x[5],[r14,#-12]
sub @t[3],@t[0],#64*4 @ len-=64*4
str @x[6],[r14,#-8]
str @x[7],[r14,#-4]
bhi .Loop_neon_outer
b .Ldone_neon
.align 4
.Lbreak_neon:
@ harmonize NEON and integer-only stack frames: load data
@ from NEON frame, but save to integer-only one; distance
@ between the two is 4*(32+4+16-32)=4*(20).
str @t[3], [sp,#4*(20+32+2)] @ save len
add @t[3],sp,#4*(32+4)
str r12, [sp,#4*(20+32+1)] @ save inp
str r14, [sp,#4*(20+32+0)] @ save out
ldr @x[12],[sp,#4*(16+10)]
ldr @x[14],[sp,#4*(16+11)]
vldmia @t[3],{d8-d15} @ fulfill ABI requirement
str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]"
str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]"
ldr @t[3], [sp,#4*(15)]
ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
ldr @t[2], [sp,#4*(13)]
ldr @x[14],[sp,#4*(14)]
str @t[3], [sp,#4*(20+16+15)]
add @t[3],sp,#4*(20)
vst1.32 {$a0-$b0},[@t[3]]! @ copy key
add sp,sp,#4*(20) @ switch frame
vst1.32 {$c0-$d0},[@t[3]]
mov @t[3],#10
b .Loop @ go integer-only
.align 4
.Ltail_neon:
cmp @t[3],#64*3
bhs .L192_or_more_neon
cmp @t[3],#64*2
bhs .L128_or_more_neon
cmp @t[3],#64*1
bhs .L64_or_more_neon
add @t[0],sp,#4*(8)
vst1.8 {$a0-$b0},[sp]
add @t[2],sp,#4*(0)
vst1.8 {$c0-$d0},[@t[0]]
b .Loop_tail_neon
.align 4
.L64_or_more_neon:
vld1.8 {$t0-$t1},[r12]!
vld1.8 {$t2-$t3},[r12]!
veor $a0,$a0,$t0
veor $b0,$b0,$t1
veor $c0,$c0,$t2
veor $d0,$d0,$t3
vst1.8 {$a0-$b0},[r14]!
vst1.8 {$c0-$d0},[r14]!
beq .Ldone_neon
add @t[0],sp,#4*(8)
vst1.8 {$a1-$b1},[sp]
add @t[2],sp,#4*(0)
vst1.8 {$c1-$d1},[@t[0]]
sub @t[3],@t[3],#64*1 @ len-=64*1
b .Loop_tail_neon
.align 4
.L128_or_more_neon:
vld1.8 {$t0-$t1},[r12]!
vld1.8 {$t2-$t3},[r12]!
veor $a0,$a0,$t0
veor $b0,$b0,$t1
vld1.8 {$t0-$t1},[r12]!
veor $c0,$c0,$t2
veor $d0,$d0,$t3
vld1.8 {$t2-$t3},[r12]!
veor $a1,$a1,$t0
veor $b1,$b1,$t1
vst1.8 {$a0-$b0},[r14]!
veor $c1,$c1,$t2
vst1.8 {$c0-$d0},[r14]!
veor $d1,$d1,$t3
vst1.8 {$a1-$b1},[r14]!
vst1.8 {$c1-$d1},[r14]!
beq .Ldone_neon
add @t[0],sp,#4*(8)
vst1.8 {$a2-$b2},[sp]
add @t[2],sp,#4*(0)
vst1.8 {$c2-$d2},[@t[0]]
sub @t[3],@t[3],#64*2 @ len-=64*2
b .Loop_tail_neon
.align 4
.L192_or_more_neon:
vld1.8 {$t0-$t1},[r12]!
vld1.8 {$t2-$t3},[r12]!
veor $a0,$a0,$t0
veor $b0,$b0,$t1
vld1.8 {$t0-$t1},[r12]!
veor $c0,$c0,$t2
veor $d0,$d0,$t3
vld1.8 {$t2-$t3},[r12]!
veor $a1,$a1,$t0
veor $b1,$b1,$t1
vld1.8 {$t0-$t1},[r12]!
veor $c1,$c1,$t2
vst1.8 {$a0-$b0},[r14]!
veor $d1,$d1,$t3
vld1.8 {$t2-$t3},[r12]!
veor $a2,$a2,$t0
vst1.8 {$c0-$d0},[r14]!
veor $b2,$b2,$t1
vst1.8 {$a1-$b1},[r14]!
veor $c2,$c2,$t2
vst1.8 {$c1-$d1},[r14]!
veor $d2,$d2,$t3
vst1.8 {$a2-$b2},[r14]!
vst1.8 {$c2-$d2},[r14]!
beq .Ldone_neon
ldmia sp,{@t[0]-@t[3]} @ load key material
add @x[0],@x[0],@t[0] @ accumulate key material
add @t[0],sp,#4*(4)
add @x[1],@x[1],@t[1]
add @x[2],@x[2],@t[2]
add @x[3],@x[3],@t[3]
ldmia @t[0],{@t[0]-@t[3]} @ load key material
add @x[4],@x[4],@t[0] @ accumulate key material
add @t[0],sp,#4*(8)
add @x[5],@x[5],@t[1]
add @x[6],@x[6],@t[2]
add @x[7],@x[7],@t[3]
ldmia @t[0],{@t[0]-@t[3]} @ load key material
# ifdef __ARMEB__
rev @x[0],@x[0]
rev @x[1],@x[1]
rev @x[2],@x[2]
rev @x[3],@x[3]
rev @x[4],@x[4]
rev @x[5],@x[5]
rev @x[6],@x[6]
rev @x[7],@x[7]
# endif
stmia sp,{@x[0]-@x[7]}
add @x[0],sp,#4*(16+8)
ldmia @x[0],{@x[0]-@x[7]} @ load second half
add @x[0],@x[0],@t[0] @ accumulate key material
add @t[0],sp,#4*(12)
add @x[1],@x[1],@t[1]
add @x[2],@x[2],@t[2]
add @x[3],@x[3],@t[3]
ldmia @t[0],{@t[0]-@t[3]} @ load key material
add @x[4],@x[4],@t[0] @ accumulate key material
add @t[0],sp,#4*(8)
add @x[5],@x[5],@t[1]
add @x[4],@x[4],#3 @ counter+3
add @x[6],@x[6],@t[2]
add @x[7],@x[7],@t[3]
ldr @t[3],[sp,#4*(32+2)] @ re-load len
# ifdef __ARMEB__
rev @x[0],@x[0]
rev @x[1],@x[1]
rev @x[2],@x[2]
rev @x[3],@x[3]
rev @x[4],@x[4]
rev @x[5],@x[5]
rev @x[6],@x[6]
rev @x[7],@x[7]
# endif
stmia @t[0],{@x[0]-@x[7]}
add @t[2],sp,#4*(0)
sub @t[3],@t[0],#64*3 @ len-=64*3
.Loop_tail_neon:
ldrb @t[0],[@t[2]],#1 @ read buffer on stack
ldrb @t[1],[r12],#1 @ read input
subs @t[3],@t[3],#1
eor @t[0],@t[0],@t[1]
strb @t[0],[r14],#1 @ store output
bne .Loop_tail_neon
.Ldone_neon:
add sp,sp,#4*(32+4)
vldmia sp,{d8-d15}
add sp,sp,#4*(16+3)
ldmia sp!,{r4-r11,pc}
.size ChaCha20_neon,.-ChaCha20_neon
.comm OPENSSL_armcap_P,4,4
#endif
___
}}}
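# Print $code, mapping the synthetic "qN#lo"/"qN#hi" notation onto the
# d-registers that alias each q-register (q<n> = d<2n>:d<2n+1>).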
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
print $_,"\n";
}
close STDOUT;
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2015
#
# ChaCha20 for ARMv8.
#
# Performance in cycles per byte out of large buffer.
#
# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU
#
# Apple A7 5.50/+49% 3.33 1.70
# Cortex-A53 8.40/+80% 4.72 4.72(*)
# Cortex-A57 8.06/+43% 4.90 4.43(**)
# Denver 4.50/+82% 2.63 2.67(*)
# X-Gene 9.50/+46% 8.82 8.89(*)
#
# (*) it's expected that doubling the interleave factor doesn't help
# all processors, only those with higher NEON latency and
# a higher instruction issue rate;
# (**) the expected improvement was actually higher;
$flavour=shift;
$output=shift;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));
my @x=map("x$_",(5..17,19..21));
my @d=map("x$_",(22..28,30));
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
(
"&add_32 (@x[$a0],@x[$a0],@x[$b0])",
"&add_32 (@x[$a1],@x[$a1],@x[$b1])",
"&add_32 (@x[$a2],@x[$a2],@x[$b2])",
"&add_32 (@x[$a3],@x[$a3],@x[$b3])",
"&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
"&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
"&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
"&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
"&ror_32 (@x[$d0],@x[$d0],16)",
"&ror_32 (@x[$d1],@x[$d1],16)",
"&ror_32 (@x[$d2],@x[$d2],16)",
"&ror_32 (@x[$d3],@x[$d3],16)",
"&add_32 (@x[$c0],@x[$c0],@x[$d0])",
"&add_32 (@x[$c1],@x[$c1],@x[$d1])",
"&add_32 (@x[$c2],@x[$c2],@x[$d2])",
"&add_32 (@x[$c3],@x[$c3],@x[$d3])",
"&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
"&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
"&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
"&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
"&ror_32 (@x[$b0],@x[$b0],20)",
"&ror_32 (@x[$b1],@x[$b1],20)",
"&ror_32 (@x[$b2],@x[$b2],20)",
"&ror_32 (@x[$b3],@x[$b3],20)",
"&add_32 (@x[$a0],@x[$a0],@x[$b0])",
"&add_32 (@x[$a1],@x[$a1],@x[$b1])",
"&add_32 (@x[$a2],@x[$a2],@x[$b2])",
"&add_32 (@x[$a3],@x[$a3],@x[$b3])",
"&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
"&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
"&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
"&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
"&ror_32 (@x[$d0],@x[$d0],24)",
"&ror_32 (@x[$d1],@x[$d1],24)",
"&ror_32 (@x[$d2],@x[$d2],24)",
"&ror_32 (@x[$d3],@x[$d3],24)",
"&add_32 (@x[$c0],@x[$c0],@x[$d0])",
"&add_32 (@x[$c1],@x[$c1],@x[$d1])",
"&add_32 (@x[$c2],@x[$c2],@x[$d2])",
"&add_32 (@x[$c3],@x[$c3],@x[$d3])",
"&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
"&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
"&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
"&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
"&ror_32 (@x[$b0],@x[$b0],25)",
"&ror_32 (@x[$b1],@x[$b1],25)",
"&ror_32 (@x[$b2],@x[$b2],25)",
"&ror_32 (@x[$b3],@x[$b3],25)"
);
}
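# Unlike the ARMv4 code above, AArch64 has a true 32-bit ror
# instruction, so each rotation is a single ror_32 and four scalar
# quarter-rounds are interleaved without any shifter-operand or
# register-offloading tricks.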
$code.=<<___;
#include "arm_arch.h"
.text
.extern OPENSSL_armcap_P
.align 5
.Lsigma:
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
.Lone:
.long 1,0,0,0
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long OPENSSL_armcap_P-.
#else
.quad OPENSSL_armcap_P-.
#endif
.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.globl ChaCha20_ctr32
.type ChaCha20_ctr32,%function
.align 5
ChaCha20_ctr32:
cbz $len,.Labort
adr @x[0],.LOPENSSL_armcap_P
cmp $len,#192
b.lo .Lshort
#ifdef __ILP32__
ldrsw @x[1],[@x[0]]
#else
ldr @x[1],[@x[0]]
#endif
ldr w17,[@x[1],@x[0]]
tst w17,#ARMV7_NEON
b.ne ChaCha20_neon
.Lshort:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
adr @x[0],.Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#64
ldp @d[0],@d[1],[@x[0]] // load sigma
ldp @d[2],@d[3],[$key] // load key
ldp @d[4],@d[5],[$key,#16]
ldp @d[6],@d[7],[$ctr] // load counter
#ifdef __ARMEB__
ror @d[2],@d[2],#32
ror @d[3],@d[3],#32
ror @d[4],@d[4],#32
ror @d[5],@d[5],#32
ror @d[6],@d[6],#32
ror @d[7],@d[7],#32
#endif
.Loop_outer:
mov.32 @x[0],@d[0] // unpack key block
lsr @x[1],@d[0],#32
mov.32 @x[2],@d[1]
lsr @x[3],@d[1],#32
mov.32 @x[4],@d[2]
lsr @x[5],@d[2],#32
mov.32 @x[6],@d[3]
lsr @x[7],@d[3],#32
mov.32 @x[8],@d[4]
lsr @x[9],@d[4],#32
mov.32 @x[10],@d[5]
lsr @x[11],@d[5],#32
mov.32 @x[12],@d[6]
lsr @x[13],@d[6],#32
mov.32 @x[14],@d[7]
lsr @x[15],@d[7],#32
mov $ctr,#10
subs $len,$len,#64
.Loop:
sub $ctr,$ctr,#1
___
foreach (&ROUND(0, 4, 8,12)) { eval; }
foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
cbnz $ctr,.Loop
add.32 @x[0],@x[0],@d[0] // accumulate key block
add @x[1],@x[1],@d[0],lsr#32
add.32 @x[2],@x[2],@d[1]
add @x[3],@x[3],@d[1],lsr#32
add.32 @x[4],@x[4],@d[2]
add @x[5],@x[5],@d[2],lsr#32
add.32 @x[6],@x[6],@d[3]
add @x[7],@x[7],@d[3],lsr#32
add.32 @x[8],@x[8],@d[4]
add @x[9],@x[9],@d[4],lsr#32
add.32 @x[10],@x[10],@d[5]
add @x[11],@x[11],@d[5],lsr#32
add.32 @x[12],@x[12],@d[6]
add @x[13],@x[13],@d[6],lsr#32
add.32 @x[14],@x[14],@d[7]
add @x[15],@x[15],@d[7],lsr#32
b.lo .Ltail
add @x[0],@x[0],@x[1],lsl#32 // pack
add @x[2],@x[2],@x[3],lsl#32
ldp @x[1],@x[3],[$inp,#0] // load input
add @x[4],@x[4],@x[5],lsl#32
add @x[6],@x[6],@x[7],lsl#32
ldp @x[5],@x[7],[$inp,#16]
add @x[8],@x[8],@x[9],lsl#32
add @x[10],@x[10],@x[11],lsl#32
ldp @x[9],@x[11],[$inp,#32]
add @x[12],@x[12],@x[13],lsl#32
add @x[14],@x[14],@x[15],lsl#32
ldp @x[13],@x[15],[$inp,#48]
add $inp,$inp,#64
#ifdef __ARMEB__
rev @x[0],@x[0]
rev @x[2],@x[2]
rev @x[4],@x[4]
rev @x[6],@x[6]
rev @x[8],@x[8]
rev @x[10],@x[10]
rev @x[12],@x[12]
rev @x[14],@x[14]
#endif
eor @x[0],@x[0],@x[1]
eor @x[2],@x[2],@x[3]
eor @x[4],@x[4],@x[5]
eor @x[6],@x[6],@x[7]
eor @x[8],@x[8],@x[9]
eor @x[10],@x[10],@x[11]
eor @x[12],@x[12],@x[13]
eor @x[14],@x[14],@x[15]
stp @x[0],@x[2],[$out,#0] // store output
add @d[6],@d[6],#1 // increment counter
stp @x[4],@x[6],[$out,#16]
stp @x[8],@x[10],[$out,#32]
stp @x[12],@x[14],[$out,#48]
add $out,$out,#64
b.hi .Loop_outer
ldp x19,x20,[x29,#16]
add sp,sp,#64
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
.Labort:
ret
.align 4
.Ltail:
add $len,$len,#64
.Less_than_64:
sub $out,$out,#1
add $inp,$inp,$len
add $out,$out,$len
add $ctr,sp,$len
neg $len,$len
add @x[0],@x[0],@x[1],lsl#32 // pack
add @x[2],@x[2],@x[3],lsl#32
add @x[4],@x[4],@x[5],lsl#32
add @x[6],@x[6],@x[7],lsl#32
add @x[8],@x[8],@x[9],lsl#32
add @x[10],@x[10],@x[11],lsl#32
add @x[12],@x[12],@x[13],lsl#32
add @x[14],@x[14],@x[15],lsl#32
#ifdef __ARMEB__
rev @x[0],@x[0]
rev @x[2],@x[2]
rev @x[4],@x[4]
rev @x[6],@x[6]
rev @x[8],@x[8]
rev @x[10],@x[10]
rev @x[12],@x[12]
rev @x[14],@x[14]
#endif
stp @x[0],@x[2],[sp,#0]
stp @x[4],@x[6],[sp,#16]
stp @x[8],@x[10],[sp,#32]
stp @x[12],@x[14],[sp,#48]
.Loop_tail:
ldrb w10,[$inp,$len]
ldrb w11,[$ctr,$len]
add $len,$len,#1
eor w10,w10,w11
strb w10,[$out,$len]
cbnz $len,.Loop_tail
stp xzr,xzr,[sp,#0]
stp xzr,xzr,[sp,#16]
stp xzr,xzr,[sp,#32]
stp xzr,xzr,[sp,#48]
ldp x19,x20,[x29,#16]
add sp,sp,#64
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
ret
.size ChaCha20_ctr32,.-ChaCha20_ctr32
___
{{{
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
map("v$_.4s",(0..7,16..23));
my (@K)=map("v$_.4s",(24..30));
my $ONE="v31.4s";
sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;
(
"&add ('$a','$a','$b')",
"&eor ('$d','$d','$a')",
"&rev32_16 ('$d','$d')", # vrot ($d,16)
"&add ('$c','$c','$d')",
"&eor ('$t','$b','$c')",
"&ushr ('$b','$t',20)",
"&sli ('$b','$t',12)",
"&add ('$a','$a','$b')",
"&eor ('$t','$d','$a')",
"&ushr ('$d','$t',24)",
"&sli ('$d','$t',8)",
"&add ('$c','$c','$d')",
"&eor ('$t','$b','$c')",
"&ushr ('$b','$t',25)",
"&sli ('$b','$t',7)",
"&ext ('$c','$c','$c',8)",
"&ext ('$d','$d','$d',$odd?4:12)",
"&ext ('$b','$b','$b',$odd?12:4)"
);
}
$code.=<<___;
.type ChaCha20_neon,%function
.align 5
ChaCha20_neon:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
adr @x[0],.Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
cmp $len,#512
b.hs .L512_or_more_neon
sub sp,sp,#64
ldp @d[0],@d[1],[@x[0]] // load sigma
ld1 {@K[0]},[@x[0]],#16
ldp @d[2],@d[3],[$key] // load key
ldp @d[4],@d[5],[$key,#16]
ld1 {@K[1],@K[2]},[$key]
ldp @d[6],@d[7],[$ctr] // load counter
ld1 {@K[3]},[$ctr]
ld1 {$ONE},[@x[0]]
#ifdef __ARMEB__
rev64 @K[0],@K[0]
ror @d[2],@d[2],#32
ror @d[3],@d[3],#32
ror @d[4],@d[4],#32
ror @d[5],@d[5],#32
ror @d[6],@d[6],#32
ror @d[7],@d[7],#32
#endif
add @K[3],@K[3],$ONE // += 1
add @K[4],@K[3],$ONE
add @K[5],@K[4],$ONE
shl $ONE,$ONE,#2 // 1 -> 4
.Loop_outer_neon:
mov.32 @x[0],@d[0] // unpack key block
lsr @x[1],@d[0],#32
mov $A0,@K[0]
mov.32 @x[2],@d[1]
lsr @x[3],@d[1],#32
mov $A1,@K[0]
mov.32 @x[4],@d[2]
lsr @x[5],@d[2],#32
mov $A2,@K[0]
mov.32 @x[6],@d[3]
mov $B0,@K[1]
lsr @x[7],@d[3],#32
mov $B1,@K[1]
mov.32 @x[8],@d[4]
mov $B2,@K[1]
lsr @x[9],@d[4],#32
mov $D0,@K[3]
mov.32 @x[10],@d[5]
mov $D1,@K[4]
lsr @x[11],@d[5],#32
mov $D2,@K[5]
mov.32 @x[12],@d[6]
mov $C0,@K[2]
lsr @x[13],@d[6],#32
mov $C1,@K[2]
mov.32 @x[14],@d[7]
mov $C2,@K[2]
lsr @x[15],@d[7],#32
mov $ctr,#10
subs $len,$len,#256
.Loop_neon:
sub $ctr,$ctr,#1
___
my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
my @thread3=&ROUND(0,4,8,12);
foreach (@thread0) {
eval; eval(shift(@thread3));
eval(shift(@thread1)); eval(shift(@thread3));
eval(shift(@thread2)); eval(shift(@thread3));
}
@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
@thread3=&ROUND(0,5,10,15);
foreach (@thread0) {
eval; eval(shift(@thread3));
eval(shift(@thread1)); eval(shift(@thread3));
eval(shift(@thread2)); eval(shift(@thread3));
}
$code.=<<___;
cbnz $ctr,.Loop_neon
add.32 @x[0],@x[0],@d[0] // accumulate key block
add $A0,$A0,@K[0]
add @x[1],@x[1],@d[0],lsr#32
add $A1,$A1,@K[0]
add.32 @x[2],@x[2],@d[1]
add $A2,$A2,@K[0]
add @x[3],@x[3],@d[1],lsr#32
add $C0,$C0,@K[2]
add.32 @x[4],@x[4],@d[2]
add $C1,$C1,@K[2]
add @x[5],@x[5],@d[2],lsr#32
add $C2,$C2,@K[2]
add.32 @x[6],@x[6],@d[3]
add $D0,$D0,@K[3]
add @x[7],@x[7],@d[3],lsr#32
add.32 @x[8],@x[8],@d[4]
add $D1,$D1,@K[4]
add @x[9],@x[9],@d[4],lsr#32
add.32 @x[10],@x[10],@d[5]
add $D2,$D2,@K[5]
add @x[11],@x[11],@d[5],lsr#32
add.32 @x[12],@x[12],@d[6]
add $B0,$B0,@K[1]
add @x[13],@x[13],@d[6],lsr#32
add.32 @x[14],@x[14],@d[7]
add $B1,$B1,@K[1]
add @x[15],@x[15],@d[7],lsr#32
add $B2,$B2,@K[1]
b.lo .Ltail_neon
add @x[0],@x[0],@x[1],lsl#32 // pack
add @x[2],@x[2],@x[3],lsl#32
ldp @x[1],@x[3],[$inp,#0] // load input
add @x[4],@x[4],@x[5],lsl#32
add @x[6],@x[6],@x[7],lsl#32
ldp @x[5],@x[7],[$inp,#16]
add @x[8],@x[8],@x[9],lsl#32
add @x[10],@x[10],@x[11],lsl#32
ldp @x[9],@x[11],[$inp,#32]
add @x[12],@x[12],@x[13],lsl#32
add @x[14],@x[14],@x[15],lsl#32
ldp @x[13],@x[15],[$inp,#48]
add $inp,$inp,#64
#ifdef __ARMEB__
rev @x[0],@x[0]
rev @x[2],@x[2]
rev @x[4],@x[4]
rev @x[6],@x[6]
rev @x[8],@x[8]
rev @x[10],@x[10]
rev @x[12],@x[12]
rev @x[14],@x[14]
#endif
ld1.8 {$T0-$T3},[$inp],#64
eor @x[0],@x[0],@x[1]
eor @x[2],@x[2],@x[3]
eor @x[4],@x[4],@x[5]
eor @x[6],@x[6],@x[7]
eor @x[8],@x[8],@x[9]
eor $A0,$A0,$T0
eor @x[10],@x[10],@x[11]
eor $B0,$B0,$T1
eor @x[12],@x[12],@x[13]
eor $C0,$C0,$T2
eor @x[14],@x[14],@x[15]
eor $D0,$D0,$T3
ld1.8 {$T0-$T3},[$inp],#64
stp @x[0],@x[2],[$out,#0] // store output
add @d[6],@d[6],#4 // increment counter
stp @x[4],@x[6],[$out,#16]
add @K[3],@K[3],$ONE // += 4
stp @x[8],@x[10],[$out,#32]
add @K[4],@K[4],$ONE
stp @x[12],@x[14],[$out,#48]
add @K[5],@K[5],$ONE
add $out,$out,#64
st1.8 {$A0-$D0},[$out],#64
ld1.8 {$A0-$D0},[$inp],#64
eor $A1,$A1,$T0
eor $B1,$B1,$T1
eor $C1,$C1,$T2
eor $D1,$D1,$T3
st1.8 {$A1-$D1},[$out],#64
eor $A2,$A2,$A0
eor $B2,$B2,$B0
eor $C2,$C2,$C0
eor $D2,$D2,$D0
st1.8 {$A2-$D2},[$out],#64
b.hi .Loop_outer_neon
ldp x19,x20,[x29,#16]
add sp,sp,#64
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
ret
.Ltail_neon:
add $len,$len,#256
cmp $len,#64
b.lo .Less_than_64
add @x[0],@x[0],@x[1],lsl#32 // pack
add @x[2],@x[2],@x[3],lsl#32
ldp @x[1],@x[3],[$inp,#0] // load input
add @x[4],@x[4],@x[5],lsl#32
add @x[6],@x[6],@x[7],lsl#32
ldp @x[5],@x[7],[$inp,#16]
add @x[8],@x[8],@x[9],lsl#32
add @x[10],@x[10],@x[11],lsl#32
ldp @x[9],@x[11],[$inp,#32]
add @x[12],@x[12],@x[13],lsl#32
add @x[14],@x[14],@x[15],lsl#32
ldp @x[13],@x[15],[$inp,#48]
add $inp,$inp,#64
#ifdef __ARMEB__
rev @x[0],@x[0]
rev @x[2],@x[2]
rev @x[4],@x[4]
rev @x[6],@x[6]
rev @x[8],@x[8]
rev @x[10],@x[10]
rev @x[12],@x[12]
rev @x[14],@x[14]
#endif
eor @x[0],@x[0],@x[1]
eor @x[2],@x[2],@x[3]
eor @x[4],@x[4],@x[5]
eor @x[6],@x[6],@x[7]
eor @x[8],@x[8],@x[9]
eor @x[10],@x[10],@x[11]
eor @x[12],@x[12],@x[13]
eor @x[14],@x[14],@x[15]
stp @x[0],@x[2],[$out,#0] // store output
add @d[6],@d[6],#4 // increment counter
stp @x[4],@x[6],[$out,#16]
stp @x[8],@x[10],[$out,#32]
stp @x[12],@x[14],[$out,#48]
add $out,$out,#64
b.eq .Ldone_neon
sub $len,$len,#64
cmp $len,#64
b.lo .Less_than_128
ld1.8 {$T0-$T3},[$inp],#64
eor $A0,$A0,$T0
eor $B0,$B0,$T1
eor $C0,$C0,$T2
eor $D0,$D0,$T3
st1.8 {$A0-$D0},[$out],#64
b.eq .Ldone_neon
sub $len,$len,#64
cmp $len,#64
b.lo .Less_than_192
ld1.8 {$T0-$T3},[$inp],#64
eor $A1,$A1,$T0
eor $B1,$B1,$T1
eor $C1,$C1,$T2
eor $D1,$D1,$T3
st1.8 {$A1-$D1},[$out],#64
b.eq .Ldone_neon
sub $len,$len,#64
st1.8 {$A2-$D2},[sp]
b .Last_neon
.Less_than_128:
st1.8 {$A0-$D0},[sp]
b .Last_neon
.Less_than_192:
st1.8 {$A1-$D1},[sp]
b .Last_neon
.align 4
.Last_neon:
sub $out,$out,#1
add $inp,$inp,$len
add $out,$out,$len
add $ctr,sp,$len
neg $len,$len
.Loop_tail_neon:
ldrb w10,[$inp,$len]
ldrb w11,[$ctr,$len]
add $len,$len,#1
eor w10,w10,w11
strb w10,[$out,$len]
cbnz $len,.Loop_tail_neon
stp xzr,xzr,[sp,#0]
stp xzr,xzr,[sp,#16]
stp xzr,xzr,[sp,#32]
stp xzr,xzr,[sp,#48]
.Ldone_neon:
ldp x19,x20,[x29,#16]
add sp,sp,#64
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
ret
.size ChaCha20_neon,.-ChaCha20_neon
___
{
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
$A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));
$code.=<<___;
.type ChaCha20_512_neon,%function
.align 5
ChaCha20_512_neon:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
adr @x[0],.Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
.L512_or_more_neon:
sub sp,sp,#128+64
ldp @d[0],@d[1],[@x[0]] // load sigma
ld1 {@K[0]},[@x[0]],#16
ldp @d[2],@d[3],[$key] // load key
ldp @d[4],@d[5],[$key,#16]
ld1 {@K[1],@K[2]},[$key]
ldp @d[6],@d[7],[$ctr] // load counter
ld1 {@K[3]},[$ctr]
ld1 {$ONE},[@x[0]]
#ifdef __ARMEB__
rev64 @K[0],@K[0]
ror @d[2],@d[2],#32
ror @d[3],@d[3],#32
ror @d[4],@d[4],#32
ror @d[5],@d[5],#32
ror @d[6],@d[6],#32
ror @d[7],@d[7],#32
#endif
add @K[3],@K[3],$ONE // += 1
stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part
add @K[3],@K[3],$ONE // not typo
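// (K[3] is now counter+2: the scalar lanes produce the blocks for counters +0 and +1)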
str @K[2],[sp,#32]
add @K[4],@K[3],$ONE
add @K[5],@K[4],$ONE
add @K[6],@K[5],$ONE
shl $ONE,$ONE,#2 // 1 -> 4
stp d8,d9,[sp,#128+0] // meet ABI requirements
stp d10,d11,[sp,#128+16]
stp d12,d13,[sp,#128+32]
stp d14,d15,[sp,#128+48]
sub $len,$len,#512 // not typo
.Loop_outer_512_neon:
mov $A0,@K[0]
mov $A1,@K[0]
mov $A2,@K[0]
mov $A3,@K[0]
mov $A4,@K[0]
mov $A5,@K[0]
mov $B0,@K[1]
mov.32 @x[0],@d[0] // unpack key block
mov $B1,@K[1]
lsr @x[1],@d[0],#32
mov $B2,@K[1]
mov.32 @x[2],@d[1]
mov $B3,@K[1]
lsr @x[3],@d[1],#32
mov $B4,@K[1]
mov.32 @x[4],@d[2]
mov $B5,@K[1]
lsr @x[5],@d[2],#32
mov $D0,@K[3]
mov.32 @x[6],@d[3]
mov $D1,@K[4]
lsr @x[7],@d[3],#32
mov $D2,@K[5]
mov.32 @x[8],@d[4]
mov $D3,@K[6]
lsr @x[9],@d[4],#32
mov $C0,@K[2]
mov.32 @x[10],@d[5]
mov $C1,@K[2]
lsr @x[11],@d[5],#32
add $D4,$D0,$ONE // +4
mov.32 @x[12],@d[6]
add $D5,$D1,$ONE // +4
lsr @x[13],@d[6],#32
mov $C2,@K[2]
mov.32 @x[14],@d[7]
mov $C3,@K[2]
lsr @x[15],@d[7],#32
mov $C4,@K[2]
stp @K[3],@K[4],[sp,#48] // off-load key block, variable part
mov $C5,@K[2]
str @K[5],[sp,#80]
mov $ctr,#5
subs $len,$len,#512
.Loop_upper_neon:
sub $ctr,$ctr,#1
___
my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
my $diff = ($#thread0+1)*6 - $#thread67 - 1;
my $i = 0;
foreach (@thread0) {
eval; eval(shift(@thread67));
eval(shift(@thread1)); eval(shift(@thread67));
eval(shift(@thread2)); eval(shift(@thread67));
eval(shift(@thread3)); eval(shift(@thread67));
eval(shift(@thread4)); eval(shift(@thread67));
eval(shift(@thread5)); eval(shift(@thread67));
}
@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
foreach (@thread0) {
eval; eval(shift(@thread67));
eval(shift(@thread1)); eval(shift(@thread67));
eval(shift(@thread2)); eval(shift(@thread67));
eval(shift(@thread3)); eval(shift(@thread67));
eval(shift(@thread4)); eval(shift(@thread67));
eval(shift(@thread5)); eval(shift(@thread67));
}
$code.=<<___;
cbnz $ctr,.Loop_upper_neon
add.32 @x[0],@x[0],@d[0] // accumulate key block
add @x[1],@x[1],@d[0],lsr#32
add.32 @x[2],@x[2],@d[1]
add @x[3],@x[3],@d[1],lsr#32
add.32 @x[4],@x[4],@d[2]
add @x[5],@x[5],@d[2],lsr#32
add.32 @x[6],@x[6],@d[3]
add @x[7],@x[7],@d[3],lsr#32
add.32 @x[8],@x[8],@d[4]
add @x[9],@x[9],@d[4],lsr#32
add.32 @x[10],@x[10],@d[5]
add @x[11],@x[11],@d[5],lsr#32
add.32 @x[12],@x[12],@d[6]
add @x[13],@x[13],@d[6],lsr#32
add.32 @x[14],@x[14],@d[7]
add @x[15],@x[15],@d[7],lsr#32
add @x[0],@x[0],@x[1],lsl#32 // pack
add @x[2],@x[2],@x[3],lsl#32
ldp @x[1],@x[3],[$inp,#0] // load input
add @x[4],@x[4],@x[5],lsl#32
add @x[6],@x[6],@x[7],lsl#32
ldp @x[5],@x[7],[$inp,#16]
add @x[8],@x[8],@x[9],lsl#32
add @x[10],@x[10],@x[11],lsl#32
ldp @x[9],@x[11],[$inp,#32]
add @x[12],@x[12],@x[13],lsl#32
add @x[14],@x[14],@x[15],lsl#32
ldp @x[13],@x[15],[$inp,#48]
add $inp,$inp,#64
#ifdef __ARMEB__
rev @x[0],@x[0]
rev @x[2],@x[2]
rev @x[4],@x[4]
rev @x[6],@x[6]
rev @x[8],@x[8]
rev @x[10],@x[10]
rev @x[12],@x[12]
rev @x[14],@x[14]
#endif
eor @x[0],@x[0],@x[1]
eor @x[2],@x[2],@x[3]
eor @x[4],@x[4],@x[5]
eor @x[6],@x[6],@x[7]
eor @x[8],@x[8],@x[9]
eor @x[10],@x[10],@x[11]
eor @x[12],@x[12],@x[13]
eor @x[14],@x[14],@x[15]
stp @x[0],@x[2],[$out,#0] // store output
add @d[6],@d[6],#1 // increment counter
mov.32 @x[0],@d[0] // unpack key block
lsr @x[1],@d[0],#32
stp @x[4],@x[6],[$out,#16]
mov.32 @x[2],@d[1]
lsr @x[3],@d[1],#32
stp @x[8],@x[10],[$out,#32]
mov.32 @x[4],@d[2]
lsr @x[5],@d[2],#32
stp @x[12],@x[14],[$out,#48]
add $out,$out,#64
mov.32 @x[6],@d[3]
lsr @x[7],@d[3],#32
mov.32 @x[8],@d[4]
lsr @x[9],@d[4],#32
mov.32 @x[10],@d[5]
lsr @x[11],@d[5],#32
mov.32 @x[12],@d[6]
lsr @x[13],@d[6],#32
mov.32 @x[14],@d[7]
lsr @x[15],@d[7],#32
mov $ctr,#5
.Loop_lower_neon:
sub $ctr,$ctr,#1
___
@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
foreach (@thread0) {
eval; eval(shift(@thread67));
eval(shift(@thread1)); eval(shift(@thread67));
eval(shift(@thread2)); eval(shift(@thread67));
eval(shift(@thread3)); eval(shift(@thread67));
eval(shift(@thread4)); eval(shift(@thread67));
eval(shift(@thread5)); eval(shift(@thread67));
}
@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
foreach (@thread0) {
eval; eval(shift(@thread67));
eval(shift(@thread1)); eval(shift(@thread67));
eval(shift(@thread2)); eval(shift(@thread67));
eval(shift(@thread3)); eval(shift(@thread67));
eval(shift(@thread4)); eval(shift(@thread67));
eval(shift(@thread5)); eval(shift(@thread67));
}
$code.=<<___;
cbnz $ctr,.Loop_lower_neon
add.32 @x[0],@x[0],@d[0] // accumulate key block
ldp @K[0],@K[1],[sp,#0]
add @x[1],@x[1],@d[0],lsr#32
ldp @K[2],@K[3],[sp,#32]
add.32 @x[2],@x[2],@d[1]
ldp @K[4],@K[5],[sp,#64]
add @x[3],@x[3],@d[1],lsr#32
add $A0,$A0,@K[0]
add.32 @x[4],@x[4],@d[2]
add $A1,$A1,@K[0]
add @x[5],@x[5],@d[2],lsr#32
add $A2,$A2,@K[0]
add.32 @x[6],@x[6],@d[3]
add $A3,$A3,@K[0]
add @x[7],@x[7],@d[3],lsr#32
add $A4,$A4,@K[0]
add.32 @x[8],@x[8],@d[4]
add $A5,$A5,@K[0]
add @x[9],@x[9],@d[4],lsr#32
add $C0,$C0,@K[2]
add.32 @x[10],@x[10],@d[5]
add $C1,$C1,@K[2]
add @x[11],@x[11],@d[5],lsr#32
add $C2,$C2,@K[2]
add.32 @x[12],@x[12],@d[6]
add $C3,$C3,@K[2]
add @x[13],@x[13],@d[6],lsr#32
add $C4,$C4,@K[2]
add.32 @x[14],@x[14],@d[7]
add $C5,$C5,@K[2]
add @x[15],@x[15],@d[7],lsr#32
add $D4,$D4,$ONE // +4
add @x[0],@x[0],@x[1],lsl#32 // pack
add $D5,$D5,$ONE // +4
add @x[2],@x[2],@x[3],lsl#32
add $D0,$D0,@K[3]
ldp @x[1],@x[3],[$inp,#0] // load input
add $D1,$D1,@K[4]
add @x[4],@x[4],@x[5],lsl#32
add $D2,$D2,@K[5]
add @x[6],@x[6],@x[7],lsl#32
add $D3,$D3,@K[6]
ldp @x[5],@x[7],[$inp,#16]
add $D4,$D4,@K[3]
add @x[8],@x[8],@x[9],lsl#32
add $D5,$D5,@K[4]
add @x[10],@x[10],@x[11],lsl#32
add $B0,$B0,@K[1]
ldp @x[9],@x[11],[$inp,#32]
add $B1,$B1,@K[1]
add @x[12],@x[12],@x[13],lsl#32
add $B2,$B2,@K[1]
add @x[14],@x[14],@x[15],lsl#32
add $B3,$B3,@K[1]
ldp @x[13],@x[15],[$inp,#48]
add $B4,$B4,@K[1]
add $inp,$inp,#64
add $B5,$B5,@K[1]
#ifdef __ARMEB__
rev @x[0],@x[0]
rev @x[2],@x[2]
rev @x[4],@x[4]
rev @x[6],@x[6]
rev @x[8],@x[8]
rev @x[10],@x[10]
rev @x[12],@x[12]
rev @x[14],@x[14]
#endif
ld1.8 {$T0-$T3},[$inp],#64
eor @x[0],@x[0],@x[1]
eor @x[2],@x[2],@x[3]
eor @x[4],@x[4],@x[5]
eor @x[6],@x[6],@x[7]
eor @x[8],@x[8],@x[9]
eor $A0,$A0,$T0
eor @x[10],@x[10],@x[11]
eor $B0,$B0,$T1
eor @x[12],@x[12],@x[13]
eor $C0,$C0,$T2
eor @x[14],@x[14],@x[15]
eor $D0,$D0,$T3
ld1.8 {$T0-$T3},[$inp],#64
stp @x[0],@x[2],[$out,#0] // store output
add @d[6],@d[6],#7 // increment counter
stp @x[4],@x[6],[$out,#16]
stp @x[8],@x[10],[$out,#32]
stp @x[12],@x[14],[$out,#48]
add $out,$out,#64
st1.8 {$A0-$D0},[$out],#64
ld1.8 {$A0-$D0},[$inp],#64
eor $A1,$A1,$T0
eor $B1,$B1,$T1
eor $C1,$C1,$T2
eor $D1,$D1,$T3
st1.8 {$A1-$D1},[$out],#64
ld1.8 {$A1-$D1},[$inp],#64
eor $A2,$A2,$A0
ldp @K[0],@K[1],[sp,#0]
eor $B2,$B2,$B0
ldp @K[2],@K[3],[sp,#32]
eor $C2,$C2,$C0
eor $D2,$D2,$D0
st1.8 {$A2-$D2},[$out],#64
ld1.8 {$A2-$D2},[$inp],#64
eor $A3,$A3,$A1
eor $B3,$B3,$B1
eor $C3,$C3,$C1
eor $D3,$D3,$D1
st1.8 {$A3-$D3},[$out],#64
ld1.8 {$A3-$D3},[$inp],#64
eor $A4,$A4,$A2
eor $B4,$B4,$B2
eor $C4,$C4,$C2
eor $D4,$D4,$D2
st1.8 {$A4-$D4},[$out],#64
shl $A0,$ONE,#1 // 4 -> 8
eor $A5,$A5,$A3
eor $B5,$B5,$B3
eor $C5,$C5,$C3
eor $D5,$D5,$D3
st1.8 {$A5-$D5},[$out],#64
add @K[3],@K[3],$A0 // += 8
add @K[4],@K[4],$A0
add @K[5],@K[5],$A0
add @K[6],@K[6],$A0
b.hs .Loop_outer_512_neon
adds $len,$len,#512
ushr $A0,$ONE,#2 // 4 -> 1
ldp d8,d9,[sp,#128+0] // meet ABI requirements
ldp d10,d11,[sp,#128+16]
ldp d12,d13,[sp,#128+32]
ldp d14,d15,[sp,#128+48]
stp @K[0],$ONE,[sp,#0] // wipe off-load area
stp @K[0],$ONE,[sp,#32]
stp @K[0],$ONE,[sp,#64]
b.eq .Ldone_512_neon
cmp $len,#192
sub @K[3],@K[3],$A0 // -= 1
sub @K[4],@K[4],$A0
sub @K[5],@K[5],$A0
add sp,sp,#128
b.hs .Loop_outer_neon
eor @K[1],@K[1],@K[1]
eor @K[2],@K[2],@K[2]
eor @K[3],@K[3],@K[3]
eor @K[4],@K[4],@K[4]
eor @K[5],@K[5],@K[5]
eor @K[6],@K[6],@K[6]
b .Loop_outer
.Ldone_512_neon:
ldp x19,x20,[x29,#16]
add sp,sp,#128+64
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
ret
.size ChaCha20_512_neon,.-ChaCha20_512_neon
___
}
}}}
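# Translate the perlasm above into plain AArch64 mnemonics: ".32"
# integer ops get their x-registers narrowed to w-registers; vector
# eor/ext/mov switch from the .4s to the .16b arrangement; "ld1.8"
# and "st1.8" drop their suffix; ldr/str/ldp/stp take q-register
# operands; and "rev32.16" becomes rev32 on the .8h arrangement.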
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or
(m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or
(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or
(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or
(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));
#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
print $_,"\n";
}
@@ -45,6 +45,9 @@ poly1305-x86_64.s: asm/poly1305-x86_64.pl
poly1305-%.S: asm/poly1305-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
poly1305-armv4.o: poly1305-armv4.S
poly1305-armv8.o: poly1305-armv8.S
files:
	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# IALU(*)/gcc-4.4 NEON
#
# ARM11xx(ARMv6) 7.78/+100% -
# Cortex-A5 6.30/+130% 2.96
# Cortex-A8 6.25/+115% 2.36
# Cortex-A9 5.10/+95% 2.55
# Cortex-A15 3.79/+85% 1.25(**)
# Snapdragon S4 5.70/+100% 1.48(**)
#
# (*) this is for -march=armv6, i.e. with a bunch of ldrb instructions loading data;
# (**) these are trade-off results; they can be improved by ~8%, but at
# the cost of a 15/12% regression on Cortex-A5/A7; it's even possible
# to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
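# Argument registers: $ctx is the Poly1305 context, $inp the input,
# $len the byte length, and (for poly1305_blocks) $padbit the 2^128
# bit that tops up each block -- 1 for full 16-byte blocks.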
$code.=<<___;
#include "arm_arch.h"
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.globl poly1305_emit
.globl poly1305_blocks
.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
.Lpoly1305_init:
stmdb sp!,{r4-r11}
eor r3,r3,r3
cmp $inp,#0
str r3,[$ctx,#0] @ zero hash value
str r3,[$ctx,#4]
str r3,[$ctx,#8]
str r3,[$ctx,#12]
str r3,[$ctx,#16]
str r3,[$ctx,#36] @ is_base2_26
add $ctx,$ctx,#20
#ifdef __thumb2__
it eq
#endif
moveq r0,#0
beq .Lno_key
#if __ARM_MAX_ARCH__>=7
adr r11,.Lpoly1305_init
ldr r12,.LOPENSSL_armcap
#endif
ldrb r4,[$inp,#0]
mov r10,#0x0fffffff
ldrb r5,[$inp,#1]
and r3,r10,#-4 @ 0x0ffffffc
ldrb r6,[$inp,#2]
ldrb r7,[$inp,#3]
orr r4,r4,r5,lsl#8
ldrb r5,[$inp,#4]
orr r4,r4,r6,lsl#16
ldrb r6,[$inp,#5]
orr r4,r4,r7,lsl#24
ldrb r7,[$inp,#6]
and r4,r4,r10
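@ (the masks in r10/r3 clamp r: first word &=0x0fffffff, rest &=0x0ffffffc)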
#if __ARM_MAX_ARCH__>=7
ldr r12,[r11,r12] @ OPENSSL_armcap_P
# ifdef __APPLE__
ldr r12,[r12]
# endif
#endif
ldrb r8,[$inp,#7]
orr r5,r5,r6,lsl#8
ldrb r6,[$inp,#8]
orr r5,r5,r7,lsl#16
ldrb r7,[$inp,#9]
orr r5,r5,r8,lsl#24
ldrb r8,[$inp,#10]
and r5,r5,r3
#if __ARM_MAX_ARCH__>=7
tst r12,#1 @ check for NEON
# ifdef __APPLE__
adr r9,poly1305_blocks_neon
adr r11,poly1305_blocks
# ifdef __thumb2__
it ne
# endif
movne r11,r9
adr r12,poly1305_emit
adr r10,poly1305_emit_neon
# ifdef __thumb2__
it ne
# endif
movne r12,r10
# else
# ifdef __thumb2__
itete eq
# endif
addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
# endif
# ifdef __thumb2__
orr r12,r12,#1 @ thumb-ify address
orr r11,r11,#1
# endif
#endif
ldrb r9,[$inp,#11]
orr r6,r6,r7,lsl#8
ldrb r7,[$inp,#12]
orr r6,r6,r8,lsl#16
ldrb r8,[$inp,#13]
orr r6,r6,r9,lsl#24
ldrb r9,[$inp,#14]
and r6,r6,r3
ldrb r10,[$inp,#15]
orr r7,r7,r8,lsl#8
str r4,[$ctx,#0]
orr r7,r7,r9,lsl#16
str r5,[$ctx,#4]
orr r7,r7,r10,lsl#24
str r6,[$ctx,#8]
and r7,r7,r3
str r7,[$ctx,#12]
#if __ARM_MAX_ARCH__>=7
stmia r2,{r11,r12} @ fill functions table
mov r0,#1
#else
mov r0,#0
#endif
.Lno_key:
ldmia sp!,{r4-r11}
#if __ARM_ARCH__>=5
ret @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size poly1305_init,.-poly1305_init
___
{
my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
my ($s1,$s2,$s3)=($r1,$r2,$r3);
$code.=<<___;
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
stmdb sp!,{r3-r11,lr}
ands $len,$len,#-16
beq .Lno_data
cmp $padbit,#0
add $len,$len,$inp @ end pointer
sub sp,sp,#32
ldmia $ctx,{$h0-$r3} @ load context
str $ctx,[sp,#12] @ offload stuff
mov lr,$inp
str $len,[sp,#16]
str $r1,[sp,#20]
str $r2,[sp,#24]
str $r3,[sp,#28]
b .Loop
.Loop:
#if __ARM_ARCH__<7
ldrb r0,[lr],#16 @ load input
# ifdef __thumb2__
it hi
# endif
addhi $h4,$h4,#1 @ 1<<128
ldrb r1,[lr,#-15]
ldrb r2,[lr,#-14]
ldrb r3,[lr,#-13]
orr r1,r0,r1,lsl#8
ldrb r0,[lr,#-12]
orr r2,r1,r2,lsl#16
ldrb r1,[lr,#-11]
orr r3,r2,r3,lsl#24
ldrb r2,[lr,#-10]
adds $h0,$h0,r3 @ accumulate input
ldrb r3,[lr,#-9]
orr r1,r0,r1,lsl#8
ldrb r0,[lr,#-8]
orr r2,r1,r2,lsl#16
ldrb r1,[lr,#-7]
orr r3,r2,r3,lsl#24
ldrb r2,[lr,#-6]
adcs $h1,$h1,r3
ldrb r3,[lr,#-5]
orr r1,r0,r1,lsl#8
ldrb r0,[lr,#-4]
orr r2,r1,r2,lsl#16
ldrb r1,[lr,#-3]
orr r3,r2,r3,lsl#24
ldrb r2,[lr,#-2]
adcs $h2,$h2,r3
ldrb r3,[lr,#-1]
orr r1,r0,r1,lsl#8
str lr,[sp,#8] @ offload input pointer
orr r2,r1,r2,lsl#16
add $s1,$r1,$r1,lsr#2
orr r3,r2,r3,lsl#24
#else
ldr r0,[lr],#16 @ load input
# ifdef __thumb2__
it hi
# endif
addhi $h4,$h4,#1 @ padbit
ldr r1,[lr,#-12]
ldr r2,[lr,#-8]
ldr r3,[lr,#-4]
# ifdef __ARMEB__
rev r0,r0
rev r1,r1
rev r2,r2
rev r3,r3
# endif
adds $h0,$h0,r0 @ accumulate input
str lr,[sp,#8] @ offload input pointer
adcs $h1,$h1,r1
add $s1,$r1,$r1,lsr#2
adcs $h2,$h2,r2
#endif
add $s2,$r2,$r2,lsr#2
adcs $h3,$h3,r3
add $s3,$r3,$r3,lsr#2
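@ s_i = r_i+(r_i>>2) = 5*r_i/4, exact since the clamp leaves r1-r3
@ divisible by 4; this folds the 2^130 = 5 (mod p) reduction into
@ the multiplication itself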
umull r2,r3,$h1,$r0
adc $h4,$h4,#0
umull r0,r1,$h0,$r0
umlal r2,r3,$h4,$s1
umlal r0,r1,$h3,$s1
ldr $r1,[sp,#20] @ reload $r1
umlal r2,r3,$h2,$s3
umlal r0,r1,$h1,$s3
umlal r2,r3,$h3,$s2
umlal r0,r1,$h2,$s2
umlal r2,r3,$h0,$r1
str r0,[sp,#0] @ future $h0
mul r0,$s2,$h4
ldr $r2,[sp,#24] @ reload $r2
adds r2,r2,r1 @ d1+=d0>>32
eor r1,r1,r1
adc lr,r3,#0 @ future $h2
str r2,[sp,#4] @ future $h1
mul r2,$s3,$h4
eor r3,r3,r3
umlal r0,r1,$h3,$s3
ldr $r3,[sp,#28] @ reload $r3
umlal r2,r3,$h3,$r0
umlal r0,r1,$h2,$r0
umlal r2,r3,$h2,$r1
umlal r0,r1,$h1,$r1
umlal r2,r3,$h1,$r2
umlal r0,r1,$h0,$r2
umlal r2,r3,$h0,$r3
ldr $h0,[sp,#0]
mul $h4,$r0,$h4
ldr $h1,[sp,#4]
adds $h2,lr,r0 @ d2+=d1>>32
ldr lr,[sp,#8] @ reload input pointer
adc r1,r1,#0
adds $h3,r2,r1 @ d3+=d2>>32
ldr r0,[sp,#16] @ reload end pointer
adc r3,r3,#0
add $h4,$h4,r3 @ h4+=d3>>32
and r1,$h4,#-4
and $h4,$h4,#3
add r1,r1,r1,lsr#2 @ *=5
adds $h0,$h0,r1
adcs $h1,$h1,#0
adcs $h2,$h2,#0
adc $h3,$h3,#0
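@ The reduction above uses 2^130 = 5 (mod 2^130-5): with k = $h4>>2,
@ r1 = ($h4 & ~3) + (($h4 & ~3)>>2) = 4*k + k = 5*k is folded back
@ into the low limb, leaving only the low two bits in $h4.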
cmp r0,lr @ done yet?
bhi .Loop
ldr $ctx,[sp,#12]
add sp,sp,#32
stmia $ctx,{$h0-$h4} @ store the result
.Lno_data:
#if __ARM_ARCH__>=5
ldmia sp!,{r3-r11,pc}
#else
ldmia sp!,{r3-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size poly1305_blocks,.-poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce)=map("r$_",(0..2));
my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
my $g4=$h4;
$code.=<<___;
.type poly1305_emit,%function
.align 5
poly1305_emit:
stmdb sp!,{r4-r11}
.Lpoly1305_emit_enter:
ldmia $ctx,{$h0-$h4}
adds $g0,$h0,#5 @ compare to modulus
adcs $g1,$h1,#0
adcs $g2,$h2,#0
adcs $g3,$h3,#0
adc $g4,$h4,#0
tst $g4,#4 @ did it carry/borrow?
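@ $g = $h + 5 overflows into bit 130 exactly when $h >= 2^130-5; that
@ carry lands in bit 2 of $g4, and the conditional moves below pick
@ either $h as is or $g = $h - (2^130-5) truncated to 128 bits.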
#ifdef __thumb2__
it ne
#endif
movne $h0,$g0
ldr $g0,[$nonce,#0]
#ifdef __thumb2__
it ne
#endif
movne $h1,$g1
ldr $g1,[$nonce,#4]
#ifdef __thumb2__
it ne
#endif
movne $h2,$g2
ldr $g2,[$nonce,#8]
#ifdef __thumb2__
it ne
#endif
movne $h3,$g3
ldr $g3,[$nonce,#12]
adds $h0,$h0,$g0
adcs $h1,$h1,$g1
adcs $h2,$h2,$g2
adc $h3,$h3,$g3
#if __ARM_ARCH__>=7
# ifdef __ARMEB__
rev $h0,$h0
rev $h1,$h1
rev $h2,$h2
rev $h3,$h3
# endif
str $h0,[$mac,#0]
str $h1,[$mac,#4]
str $h2,[$mac,#8]
str $h3,[$mac,#12]
#else
strb $h0,[$mac,#0]
mov $h0,$h0,lsr#8
strb $h1,[$mac,#4]
mov $h1,$h1,lsr#8
strb $h2,[$mac,#8]
mov $h2,$h2,lsr#8
strb $h3,[$mac,#12]
mov $h3,$h3,lsr#8
strb $h0,[$mac,#1]
mov $h0,$h0,lsr#8
strb $h1,[$mac,#5]
mov $h1,$h1,lsr#8
strb $h2,[$mac,#9]
mov $h2,$h2,lsr#8
strb $h3,[$mac,#13]
mov $h3,$h3,lsr#8
strb $h0,[$mac,#2]
mov $h0,$h0,lsr#8
strb $h1,[$mac,#6]
mov $h1,$h1,lsr#8
strb $h2,[$mac,#10]
mov $h2,$h2,lsr#8
strb $h3,[$mac,#14]
mov $h3,$h3,lsr#8
strb $h0,[$mac,#3]
strb $h1,[$mac,#7]
strb $h2,[$mac,#11]
strb $h3,[$mac,#15]
#endif
ldmia sp!,{r4-r11}
#if __ARM_ARCH__>=5
ret @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size poly1305_emit,.-poly1305_emit
___
{
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.fpu neon
.type poly1305_init_neon,%function
.align 5
poly1305_init_neon:
ldr r4,[$ctx,#20] @ load key base 2^32
ldr r5,[$ctx,#24]
ldr r6,[$ctx,#28]
ldr r7,[$ctx,#32]
and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
mov r3,r4,lsr#26
mov r4,r5,lsr#20
orr r3,r3,r5,lsl#6
mov r5,r6,lsr#14
orr r4,r4,r6,lsl#12
mov r6,r7,lsr#8
orr r5,r5,r7,lsl#18
and r3,r3,#0x03ffffff
and r4,r4,#0x03ffffff
and r5,r5,#0x03ffffff
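@ Each 26-bit limb is assembled by pairing the right-shifted tail of
@ one 32-bit key word with the left-shifted spill of the next, so that
@ limb i holds bits [26*i,26*i+26) of r.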
vdup.32 $R0,r2 @ r^1 in both lanes
add r2,r3,r3,lsl#2 @ *5
vdup.32 $R1,r3
add r3,r4,r4,lsl#2
vdup.32 $S1,r2
vdup.32 $R2,r4
add r4,r5,r5,lsl#2
vdup.32 $S2,r3
vdup.32 $R3,r5
add r5,r6,r6,lsl#2
vdup.32 $S3,r4
vdup.32 $R4,r6
vdup.32 $S4,r5
mov $zeros,#2 @ counter
.Lsquare_neon:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
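@ Terms that would land in limbs 5..8 are folded back on the fly:
@ since 2^130 = 5 (mod p), h_i*r_j with i+j >= 5 contributes to limb
@ i+j-5 multiplied by 5, which is why the $Sx vectors cache 5*r_x.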
vmull.u32 $D0,$R0,${R0}[1]
vmull.u32 $D1,$R1,${R0}[1]
vmull.u32 $D2,$R2,${R0}[1]
vmull.u32 $D3,$R3,${R0}[1]
vmull.u32 $D4,$R4,${R0}[1]
vmlal.u32 $D0,$R4,${S1}[1]
vmlal.u32 $D1,$R0,${R1}[1]
vmlal.u32 $D2,$R1,${R1}[1]
vmlal.u32 $D3,$R2,${R1}[1]
vmlal.u32 $D4,$R3,${R1}[1]
vmlal.u32 $D0,$R3,${S2}[1]
vmlal.u32 $D1,$R4,${S2}[1]
vmlal.u32 $D3,$R1,${R2}[1]
vmlal.u32 $D2,$R0,${R2}[1]
vmlal.u32 $D4,$R2,${R2}[1]
vmlal.u32 $D0,$R2,${S3}[1]
vmlal.u32 $D3,$R0,${R3}[1]
vmlal.u32 $D1,$R3,${S3}[1]
vmlal.u32 $D2,$R4,${S3}[1]
vmlal.u32 $D4,$R1,${R3}[1]
vmlal.u32 $D3,$R4,${S4}[1]
vmlal.u32 $D0,$R1,${S4}[1]
vmlal.u32 $D1,$R2,${S4}[1]
vmlal.u32 $D2,$R3,${S4}[1]
vmlal.u32 $D4,$R0,${R4}[1]
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
@ and P. Schwabe
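@ Every 64-bit accumulator is split into a 26-bit limb and a carry
@ into the next one; only the h4 -> h0 carry picks up the factor of 5
@ (an add plus a shifted-by-2 add below), and a final h0 -> h1 and
@ h3 -> h4 pass bounds all limbs without a full modular reduction.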
vshr.u64 $T0,$D3,#26
vmovn.i64 $D3#lo,$D3
vshr.u64 $T1,$D0,#26
vmovn.i64 $D0#lo,$D0
vadd.i64 $D4,$D4,$T0 @ h3 -> h4
vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
vadd.i64 $D1,$D1,$T1 @ h0 -> h1
vbic.i32 $D0#lo,#0xfc000000
vshrn.u64 $T0#lo,$D4,#26
vmovn.i64 $D4#lo,$D4
vshr.u64 $T1,$D1,#26
vmovn.i64 $D1#lo,$D1
vadd.i64 $D2,$D2,$T1 @ h1 -> h2
vbic.i32 $D4#lo,#0xfc000000
vbic.i32 $D1#lo,#0xfc000000
vadd.i32 $D0#lo,$D0#lo,$T0#lo
vshl.u32 $T0#lo,$T0#lo,#2
vshrn.u64 $T1#lo,$D2,#26
vmovn.i64 $D2#lo,$D2
vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
vbic.i32 $D2#lo,#0xfc000000
vshr.u32 $T0#lo,$D0#lo,#26
vbic.i32 $D0#lo,#0xfc000000
vshr.u32 $T1#lo,$D3#lo,#26
vbic.i32 $D3#lo,#0xfc000000
vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
subs $zeros,$zeros,#1
beq .Lsquare_break_neon
add $tbl0,$ctx,#(48+0*9*4)
add $tbl1,$ctx,#(48+1*9*4)
vtrn.32 $R0,$D0#lo @ r^2:r^1
vtrn.32 $R2,$D2#lo
vtrn.32 $R3,$D3#lo
vtrn.32 $R1,$D1#lo
vtrn.32 $R4,$D4#lo
vshl.u32 $S2,$R2,#2 @ *5
vshl.u32 $S3,$R3,#2
vshl.u32 $S1,$R1,#2
vshl.u32 $S4,$R4,#2
vadd.i32 $S2,$S2,$R2
vadd.i32 $S1,$S1,$R1
vadd.i32 $S3,$S3,$R3
vadd.i32 $S4,$S4,$R4
vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
vst1.32 {${S4}[0]},[$tbl0,:32]
vst1.32 {${S4}[1]},[$tbl1,:32]
b .Lsquare_neon
.align 4
.Lsquare_break_neon:
add $tbl0,$ctx,#(48+2*4*9)
add $tbl1,$ctx,#(48+3*4*9)
vmov $R0,$D0#lo @ r^4:r^3
vshl.u32 $S1,$D1#lo,#2 @ *5
vmov $R1,$D1#lo
vshl.u32 $S2,$D2#lo,#2
vmov $R2,$D2#lo
vshl.u32 $S3,$D3#lo,#2
vmov $R3,$D3#lo
vshl.u32 $S4,$D4#lo,#2
vmov $R4,$D4#lo
vadd.i32 $S1,$S1,$D1#lo
vadd.i32 $S2,$S2,$D2#lo
vadd.i32 $S3,$S3,$D3#lo
vadd.i32 $S4,$S4,$D4#lo
vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
vst1.32 {${S4}[0]},[$tbl0]
vst1.32 {${S4}[1]},[$tbl1]
ret @ bx lr
.size poly1305_init_neon,.-poly1305_init_neon
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
ldr ip,[$ctx,#36] @ is_base2_26
ands $len,$len,#-16
beq .Lno_data_neon
cmp $len,#64
bhs .Lenter_neon
tst ip,ip @ is_base2_26?
beq poly1305_blocks
.Lenter_neon:
stmdb sp!,{r4-r7}
vstmdb sp!,{d8-d15} @ ABI specification says so
tst ip,ip @ is_base2_26?
bne .Lbase2_26_neon
stmdb sp!,{r1-r3,lr}
bl poly1305_init_neon
ldr r4,[$ctx,#0] @ load hash value base 2^32
ldr r5,[$ctx,#4]
ldr r6,[$ctx,#8]
ldr r7,[$ctx,#12]
ldr ip,[$ctx,#16]
and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
mov r3,r4,lsr#26
veor $D0#lo,$D0#lo,$D0#lo
mov r4,r5,lsr#20
orr r3,r3,r5,lsl#6
veor $D1#lo,$D1#lo,$D1#lo
mov r5,r6,lsr#14
orr r4,r4,r6,lsl#12
veor $D2#lo,$D2#lo,$D2#lo
mov r6,r7,lsr#8
orr r5,r5,r7,lsl#18
veor $D3#lo,$D3#lo,$D3#lo
and r3,r3,#0x03ffffff
orr r6,r6,ip,lsl#24
veor $D4#lo,$D4#lo,$D4#lo
and r4,r4,#0x03ffffff
mov r1,#1
and r5,r5,#0x03ffffff
str r1,[$ctx,#36] @ is_base2_26
vmov.32 $D0#lo[0],r2
vmov.32 $D1#lo[0],r3
vmov.32 $D2#lo[0],r4
vmov.32 $D3#lo[0],r5
vmov.32 $D4#lo[0],r6
adr $zeros,.Lzeros
ldmia sp!,{r1-r3,lr}
b .Lbase2_32_neon
.align 4
.Lbase2_26_neon:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ load hash value
veor $D0#lo,$D0#lo,$D0#lo
veor $D1#lo,$D1#lo,$D1#lo
veor $D2#lo,$D2#lo,$D2#lo
veor $D3#lo,$D3#lo,$D3#lo
veor $D4#lo,$D4#lo,$D4#lo
vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
adr $zeros,.Lzeros
vld1.32 {$D4#lo[0]},[$ctx]
sub $ctx,$ctx,#16 @ rewind
.Lbase2_32_neon:
add $in2,$inp,#32
mov $padbit,$padbit,lsl#24
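@ The pad bit sits at bit 24 of the top limb: 4*26 + 24 = 128, i.e.
@ the 2^128 bit that tops off every full 16-byte block.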
tst $len,#31
beq .Leven
vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
vmov.32 $H4#lo[0],$padbit
sub $len,$len,#16
add $in2,$inp,#32
# ifdef __ARMEB__
vrev32.8 $H0,$H0
vrev32.8 $H3,$H3
vrev32.8 $H1,$H1
vrev32.8 $H2,$H2
# endif
vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
vshl.u32 $H3#lo,$H3#lo,#18
vsri.u32 $H3#lo,$H2#lo,#14
vshl.u32 $H2#lo,$H2#lo,#12
vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
vbic.i32 $H3#lo,#0xfc000000
vsri.u32 $H2#lo,$H1#lo,#20
vshl.u32 $H1#lo,$H1#lo,#6
vbic.i32 $H2#lo,#0xfc000000
vsri.u32 $H1#lo,$H0#lo,#26
vadd.i32 $H3#hi,$H3#lo,$D3#lo
vbic.i32 $H0#lo,#0xfc000000
vbic.i32 $H1#lo,#0xfc000000
vadd.i32 $H2#hi,$H2#lo,$D2#lo
vadd.i32 $H0#hi,$H0#lo,$D0#lo
vadd.i32 $H1#hi,$H1#lo,$D1#lo
mov $tbl1,$zeros
add $tbl0,$ctx,#48
cmp $len,$len
b .Long_tail
.align 4
.Leven:
subs $len,$len,#64
# ifdef __thumb2__
it lo
# endif
movlo $in2,$zeros
vmov.i32 $H4,#1<<24 @ padbit, yes, always
vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
add $inp,$inp,#64
vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
add $in2,$in2,#64
# ifdef __thumb2__
itt hi
# endif
addhi $tbl1,$ctx,#(48+1*9*4)
addhi $tbl0,$ctx,#(48+3*9*4)
# ifdef __ARMEB__
vrev32.8 $H0,$H0
vrev32.8 $H3,$H3
vrev32.8 $H1,$H1
vrev32.8 $H2,$H2
# endif
vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
vshl.u32 $H3,$H3,#18
vsri.u32 $H3,$H2,#14
vshl.u32 $H2,$H2,#12
vbic.i32 $H3,#0xfc000000
vsri.u32 $H2,$H1,#20
vshl.u32 $H1,$H1,#6
vbic.i32 $H2,#0xfc000000
vsri.u32 $H1,$H0,#26
vbic.i32 $H0,#0xfc000000
vbic.i32 $H1,#0xfc000000
bls .Lskip_loop
vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
b .Loop_neon
.align 5
.Loop_neon:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
@ \___________________/
@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
@ \___________________/ \____________________/
@
@ Note that we start with inp[2:3]*r^2. This is because it
@ doesn't depend on the reduction in the previous iteration.
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ inp[2:3]*r^2
vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
vmull.u32 $D2,$H2#hi,${R0}[1]
vadd.i32 $H0#lo,$H0#lo,$D0#lo
vmull.u32 $D0,$H0#hi,${R0}[1]
vadd.i32 $H3#lo,$H3#lo,$D3#lo
vmull.u32 $D3,$H3#hi,${R0}[1]
vmlal.u32 $D2,$H1#hi,${R1}[1]
vadd.i32 $H1#lo,$H1#lo,$D1#lo
vmull.u32 $D1,$H1#hi,${R0}[1]
vadd.i32 $H4#lo,$H4#lo,$D4#lo
vmull.u32 $D4,$H4#hi,${R0}[1]
subs $len,$len,#64
vmlal.u32 $D0,$H4#hi,${S1}[1]
# ifdef __thumb2__
it lo
# endif
movlo $in2,$zeros
vmlal.u32 $D3,$H2#hi,${R1}[1]
vld1.32 ${S4}[1],[$tbl1,:32]
vmlal.u32 $D1,$H0#hi,${R1}[1]
vmlal.u32 $D4,$H3#hi,${R1}[1]
vmlal.u32 $D0,$H3#hi,${S2}[1]
vmlal.u32 $D3,$H1#hi,${R2}[1]
vmlal.u32 $D4,$H2#hi,${R2}[1]
vmlal.u32 $D1,$H4#hi,${S2}[1]
vmlal.u32 $D2,$H0#hi,${R2}[1]
vmlal.u32 $D3,$H0#hi,${R3}[1]
vmlal.u32 $D0,$H2#hi,${S3}[1]
vmlal.u32 $D4,$H1#hi,${R3}[1]
vmlal.u32 $D1,$H3#hi,${S3}[1]
vmlal.u32 $D2,$H4#hi,${S3}[1]
vmlal.u32 $D3,$H4#hi,${S4}[1]
vmlal.u32 $D0,$H1#hi,${S4}[1]
vmlal.u32 $D4,$H0#hi,${R4}[1]
vmlal.u32 $D1,$H2#hi,${S4}[1]
vmlal.u32 $D2,$H3#hi,${S4}[1]
vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
add $in2,$in2,#64
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ (hash+inp[0:1])*r^4 and accumulate
vmlal.u32 $D3,$H3#lo,${R0}[0]
vmlal.u32 $D0,$H0#lo,${R0}[0]
vmlal.u32 $D4,$H4#lo,${R0}[0]
vmlal.u32 $D1,$H1#lo,${R0}[0]
vmlal.u32 $D2,$H2#lo,${R0}[0]
vld1.32 ${S4}[0],[$tbl0,:32]
vmlal.u32 $D3,$H2#lo,${R1}[0]
vmlal.u32 $D0,$H4#lo,${S1}[0]
vmlal.u32 $D4,$H3#lo,${R1}[0]
vmlal.u32 $D1,$H0#lo,${R1}[0]
vmlal.u32 $D2,$H1#lo,${R1}[0]
vmlal.u32 $D3,$H1#lo,${R2}[0]
vmlal.u32 $D0,$H3#lo,${S2}[0]
vmlal.u32 $D4,$H2#lo,${R2}[0]
vmlal.u32 $D1,$H4#lo,${S2}[0]
vmlal.u32 $D2,$H0#lo,${R2}[0]
vmlal.u32 $D3,$H0#lo,${R3}[0]
vmlal.u32 $D0,$H2#lo,${S3}[0]
vmlal.u32 $D4,$H1#lo,${R3}[0]
vmlal.u32 $D1,$H3#lo,${S3}[0]
vmlal.u32 $D3,$H4#lo,${S4}[0]
vmlal.u32 $D2,$H4#lo,${S3}[0]
vmlal.u32 $D0,$H1#lo,${S4}[0]
vmlal.u32 $D4,$H0#lo,${R4}[0]
vmov.i32 $H4,#1<<24 @ padbit, yes, always
vmlal.u32 $D1,$H2#lo,${S4}[0]
vmlal.u32 $D2,$H3#lo,${S4}[0]
vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
add $inp,$inp,#64
# ifdef __ARMEB__
vrev32.8 $H0,$H0
vrev32.8 $H1,$H1
vrev32.8 $H2,$H2
vrev32.8 $H3,$H3
# endif
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction interleaved with base 2^32 -> base 2^26
vshr.u64 $T0,$D3,#26
vmovn.i64 $D3#lo,$D3
vshr.u64 $T1,$D0,#26
vmovn.i64 $D0#lo,$D0
vadd.i64 $D4,$D4,$T0 @ h3 -> h4
vbic.i32 $D3#lo,#0xfc000000
vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
vadd.i64 $D1,$D1,$T1 @ h0 -> h1
vshl.u32 $H3,$H3,#18
vbic.i32 $D0#lo,#0xfc000000
vshrn.u64 $T0#lo,$D4,#26
vmovn.i64 $D4#lo,$D4
vshr.u64 $T1,$D1,#26
vmovn.i64 $D1#lo,$D1
vadd.i64 $D2,$D2,$T1 @ h1 -> h2
vsri.u32 $H3,$H2,#14
vbic.i32 $D4#lo,#0xfc000000
vshl.u32 $H2,$H2,#12
vbic.i32 $D1#lo,#0xfc000000
vadd.i32 $D0#lo,$D0#lo,$T0#lo
vshl.u32 $T0#lo,$T0#lo,#2
vbic.i32 $H3,#0xfc000000
vshrn.u64 $T1#lo,$D2,#26
vmovn.i64 $D2#lo,$D2
vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
vsri.u32 $H2,$H1,#20
vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
vshl.u32 $H1,$H1,#6
vbic.i32 $D2#lo,#0xfc000000
vbic.i32 $H2,#0xfc000000
vshr.u32 $T0#lo,$D0#lo,#26
vbic.i32 $D0#lo,#0xfc000000
vsri.u32 $H1,$H0,#26
vbic.i32 $H0,#0xfc000000
vshr.u32 $T1#lo,$D3#lo,#26
vbic.i32 $D3#lo,#0xfc000000
vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
vbic.i32 $H1,#0xfc000000
bhi .Loop_neon
.Lskip_loop:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
add $tbl1,$ctx,#(48+0*9*4)
add $tbl0,$ctx,#(48+1*9*4)
adds $len,$len,#32
# ifdef __thumb2__
it ne
# endif
movne $len,#0
bne .Long_tail
vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
vadd.i32 $H0#hi,$H0#lo,$D0#lo
vadd.i32 $H3#hi,$H3#lo,$D3#lo
vadd.i32 $H1#hi,$H1#lo,$D1#lo
vadd.i32 $H4#hi,$H4#lo,$D4#lo
.Long_tail:
vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
vmull.u32 $D2,$H2#hi,$R0
vadd.i32 $H0#lo,$H0#lo,$D0#lo
vmull.u32 $D0,$H0#hi,$R0
vadd.i32 $H3#lo,$H3#lo,$D3#lo
vmull.u32 $D3,$H3#hi,$R0
vadd.i32 $H1#lo,$H1#lo,$D1#lo
vmull.u32 $D1,$H1#hi,$R0
vadd.i32 $H4#lo,$H4#lo,$D4#lo
vmull.u32 $D4,$H4#hi,$R0
vmlal.u32 $D0,$H4#hi,$S1
vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
vmlal.u32 $D3,$H2#hi,$R1
vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
vmlal.u32 $D1,$H0#hi,$R1
vmlal.u32 $D4,$H3#hi,$R1
vmlal.u32 $D2,$H1#hi,$R1
vmlal.u32 $D3,$H1#hi,$R2
vld1.32 ${S4}[1],[$tbl1,:32]
vmlal.u32 $D0,$H3#hi,$S2
vld1.32 ${S4}[0],[$tbl0,:32]
vmlal.u32 $D4,$H2#hi,$R2
vmlal.u32 $D1,$H4#hi,$S2
vmlal.u32 $D2,$H0#hi,$R2
vmlal.u32 $D3,$H0#hi,$R3
# ifdef __thumb2__
it ne
# endif
addne $tbl1,$ctx,#(48+2*9*4)
vmlal.u32 $D0,$H2#hi,$S3
# ifdef __thumb2__
it ne
# endif
addne $tbl0,$ctx,#(48+3*9*4)
vmlal.u32 $D4,$H1#hi,$R3
vmlal.u32 $D1,$H3#hi,$S3
vmlal.u32 $D2,$H4#hi,$S3
vmlal.u32 $D3,$H4#hi,$S4
vmov.u64 $MASK,#-1 @ can be redundant
vmlal.u32 $D0,$H1#hi,$S4
vshr.u64 $MASK,$MASK,#38
vmlal.u32 $D4,$H0#hi,$R4
vmlal.u32 $D1,$H2#hi,$S4
vmlal.u32 $D2,$H3#hi,$S4
beq .Lshort_tail
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ (hash+inp[0:1])*r^4:r^3 and accumulate
vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
vmlal.u32 $D2,$H2#lo,$R0
vmlal.u32 $D0,$H0#lo,$R0
vmlal.u32 $D3,$H3#lo,$R0
vmlal.u32 $D1,$H1#lo,$R0
vmlal.u32 $D4,$H4#lo,$R0
vmlal.u32 $D0,$H4#lo,$S1
vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
vmlal.u32 $D3,$H2#lo,$R1
vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
vmlal.u32 $D1,$H0#lo,$R1
vmlal.u32 $D4,$H3#lo,$R1
vmlal.u32 $D2,$H1#lo,$R1
vmlal.u32 $D3,$H1#lo,$R2
vld1.32 ${S4}[1],[$tbl1,:32]
vmlal.u32 $D0,$H3#lo,$S2
vld1.32 ${S4}[0],[$tbl0,:32]
vmlal.u32 $D4,$H2#lo,$R2
vmlal.u32 $D1,$H4#lo,$S2
vmlal.u32 $D2,$H0#lo,$R2
vmlal.u32 $D3,$H0#lo,$R3
vmlal.u32 $D0,$H2#lo,$S3
vmlal.u32 $D4,$H1#lo,$R3
vmlal.u32 $D1,$H3#lo,$S3
vmlal.u32 $D2,$H4#lo,$S3
vmlal.u32 $D3,$H4#lo,$S4
vmov.u64 $MASK,#-1
vmlal.u32 $D0,$H1#lo,$S4
vshr.u64 $MASK,$MASK,#38
vmlal.u32 $D4,$H0#lo,$R4
vmlal.u32 $D1,$H2#lo,$S4
vmlal.u32 $D2,$H3#lo,$S4
.Lshort_tail:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction, but without narrowing
vshr.u64 $T0,$D3,#26
vand.i64 $D3,$D3,$MASK
vshr.u64 $T1,$D0,#26
vand.i64 $D0,$D0,$MASK
vadd.i64 $D4,$D4,$T0 @ h3 -> h4
vadd.i64 $D1,$D1,$T1 @ h0 -> h1
vshr.u64 $T0,$D4,#26
vand.i64 $D4,$D4,$MASK
vshr.u64 $T1,$D1,#26
vand.i64 $D1,$D1,$MASK
vadd.i64 $D2,$D2,$T1 @ h1 -> h2
vadd.i64 $D0,$D0,$T0
vshl.u64 $T0,$T0,#2
vshr.u64 $T1,$D2,#26
vand.i64 $D2,$D2,$MASK
vadd.i64 $D0,$D0,$T0 @ h4 -> h0
vadd.i64 $D3,$D3,$T1 @ h2 -> h3
vshr.u64 $T0,$D0,#26
vand.i64 $D0,$D0,$MASK
vshr.u64 $T1,$D3,#26
vand.i64 $D3,$D3,$MASK
vadd.i64 $D1,$D1,$T0 @ h0 -> h1
vadd.i64 $D4,$D4,$T1 @ h3 -> h4
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ horizontal addition
vadd.i64 $D2#lo,$D2#lo,$D2#hi
vadd.i64 $D0#lo,$D0#lo,$D0#hi
vadd.i64 $D3#lo,$D3#lo,$D3#hi
vadd.i64 $D1#lo,$D1#lo,$D1#hi
vadd.i64 $D4#lo,$D4#lo,$D4#hi
cmp $len,#0
bne .Leven
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ store hash value
vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
vst1.32 {$D4#lo[0]},[$ctx]
vldmia sp!,{d8-d15} @ epilogue
ldmia sp!,{r4-r7}
.Lno_data_neon:
ret @ bx lr
.size poly1305_blocks_neon,.-poly1305_blocks_neon
.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
ldr ip,[$ctx,#36] @ is_base2_26
stmdb sp!,{r4-r11}
tst ip,ip
beq .Lpoly1305_emit_enter
ldmia $ctx,{$h0-$h4}
eor $g0,$g0,$g0
adds $h0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
mov $h1,$h1,lsr#6
adcs $h1,$h1,$h2,lsl#20
mov $h2,$h2,lsr#12
adcs $h2,$h2,$h3,lsl#14
mov $h3,$h3,lsr#18
adcs $h3,$h3,$h4,lsl#8
adc $h4,$g0,$h4,lsr#24 @ can be partially reduced ...
and $g0,$h4,#-4 @ ... so reduce
and $h4,$h4,#3
add $g0,$g0,$g0,lsr#2 @ *= 5
adds $h0,$h0,$g0
adcs $h1,$h1,#0
adcs $h2,$h2,#0
adc $h3,$h3,#0
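@ Converting five 26-bit limbs to four 32-bit words can leave a few
@ excess bits in $h4, which are folded back via the same *5 identity
@ before the final comparison against the modulus below.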
adds $g0,$h0,#5 @ compare to modulus
adcs $g1,$h1,#0
adcs $g2,$h2,#0
adcs $g3,$h3,#0
adc $g4,$h4,#0
tst $g4,#4 @ did it carry/borrow?
# ifdef __thumb2__
it ne
# endif
movne $h0,$g0
ldr $g0,[$nonce,#0]
# ifdef __thumb2__
it ne
# endif
movne $h1,$g1
ldr $g1,[$nonce,#4]
# ifdef __thumb2__
it ne
# endif
movne $h2,$g2
ldr $g2,[$nonce,#8]
# ifdef __thumb2__
it ne
# endif
movne $h3,$g3
ldr $g3,[$nonce,#12]
adds $h0,$h0,$g0 @ accumulate nonce
adcs $h1,$h1,$g1
adcs $h2,$h2,$g2
adc $h3,$h3,$g3
# ifdef __ARMEB__
rev $h0,$h0
rev $h1,$h1
rev $h2,$h2
rev $h3,$h3
# endif
str $h0,[$mac,#0] @ store the result
str $h1,[$mac,#4]
str $h2,[$mac,#8]
str $h3,[$mac,#12]
ldmia sp!,{r4-r11}
ret @ bx lr
.size poly1305_emit_neon,.-poly1305_emit_neon
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lpoly1305_init
#endif
___
} }
$code.=<<___;
.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___
foreach (split("\n",$code)) {
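# Post-process the generated code: q<N>#lo and q<N>#hi used above are
# shorthand for the low/high d-register half of a NEON q-register;
# "ret" becomes "bx lr", while a literal "bx lr" is emitted as its raw
# opcode so that the module still assembles with -march=armv4.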
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT; # enforce flush
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
# IALU/gcc-4.9 NEON
#
# Apple A7 1.86/+5% 0.72
# Cortex-A53 2.63/+58% 1.47
# Cortex-A57 2.70/+7% 1.14
# Denver 1.39/+50% 1.18(*)
# X-Gene 2.00/+68% 2.19
#
# (*) the estimate based on resource availability is less than 1.0,
# i.e. the measured result is worse than expected, presumably
# because the binary translator is not almighty;
$flavour=shift;
$output=shift;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
my ($mac,$nonce)=($inp,$len);
my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
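# The scalar AArch64 path keeps the accumulator as two 64-bit words
# ($h0,$h1) plus a couple of bits in $h2, and caches $s1 = r1+(r1>>2);
# because r1 has its low two bits clamped to zero, $s1 equals 5*r1/4,
# which absorbs the 2^130 = 5 (mod p) reduction into the multiply.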
$code.=<<___;
#include "arm_arch.h"
.text
// forward "declarations" are required for Apple
.extern OPENSSL_armcap_P
.globl poly1305_blocks
.globl poly1305_emit
.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
cmp $inp,xzr
stp xzr,xzr,[$ctx] // zero hash value
stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
csel x0,xzr,x0,eq
b.eq .Lno_key
#ifdef __ILP32__
ldrsw $t1,.LOPENSSL_armcap_P
#else
ldr $t1,.LOPENSSL_armcap_P
#endif
adr $t0,.LOPENSSL_armcap_P
ldp $r0,$r1,[$inp] // load key
mov $s1,#0xfffffffc0fffffff
movk $s1,#0x0fff,lsl#48
ldr w17,[$t0,$t1]
#ifdef __ARMEB__
rev $r0,$r0 // flip bytes
rev $r1,$r1
#endif
and $r0,$r0,$s1 // &=0ffffffc0fffffff
and $s1,$s1,#-4
and $r1,$r1,$s1 // &=0ffffffc0ffffffc
stp $r0,$r1,[$ctx,#32] // save key value
tst w17,#ARMV7_NEON
adr $d0,poly1305_blocks
adr $r0,poly1305_blocks_neon
adr $d1,poly1305_emit
adr $r1,poly1305_emit_neon
csel $d0,$d0,$r0,eq
csel $d1,$d1,$r1,eq
stp $d0,$d1,[$len]
mov x0,#1
.Lno_key:
ret
.size poly1305_init,.-poly1305_init
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
ands $len,$len,#-16
b.eq .Lno_data
ldp $h0,$h1,[$ctx] // load hash value
ldp $r0,$r1,[$ctx,#32] // load key value
ldr $h2,[$ctx,#16]
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
b .Loop
.align 5
.Loop:
ldp $t0,$t1,[$inp],#16 // load input
sub $len,$len,#16
#ifdef __ARMEB__
rev $t0,$t0
rev $t1,$t1
#endif
adds $h0,$h0,$t0 // accumulate input
adcs $h1,$h1,$t1
mul $d0,$h0,$r0 // h0*r0
adc $h2,$h2,$padbit
umulh $d1,$h0,$r0
mul $t0,$h1,$s1 // h1*5*r1
umulh $t1,$h1,$s1
adds $d0,$d0,$t0
mul $t0,$h0,$r1 // h0*r1
adc $d1,$d1,$t1
umulh $d2,$h0,$r1
adds $d1,$d1,$t0
mul $t0,$h1,$r0 // h1*r0
adc $d2,$d2,xzr
umulh $t1,$h1,$r0
adds $d1,$d1,$t0
mul $t0,$h2,$s1 // h2*5*r1
adc $d2,$d2,$t1
mul $t1,$h2,$r0 // h2*r0
adds $d1,$d1,$t0
adc $d2,$d2,$t1
and $t0,$d2,#-4 // final reduction
and $h2,$d2,#3
add $t0,$t0,$d2,lsr#2
adds $h0,$d0,$t0
adc $h1,$d1,xzr
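// $d2 carries everything at weight 2^128 and beyond; with k = $d2>>2,
// $t0 = ($d2 & -4) + k = 5*k is folded back into the bottom word,
// the base 2^64 counterpart of the 2^130 = 5 reduction.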
cbnz $len,.Loop
stp $h0,$h1,[$ctx] // store hash value
str $h2,[$ctx,#16]
.Lno_data:
ret
.size poly1305_blocks,.-poly1305_blocks
.type poly1305_emit,%function
.align 5
poly1305_emit:
ldp $h0,$h1,[$ctx] // load hash base 2^64
ldr $h2,[$ctx,#16]
ldp $t0,$t1,[$nonce] // load nonce
adds $d0,$h0,#5 // compare to modulus
adcs $d1,$h1,xzr
adc $d2,$h2,xzr
tst $d2,#-4 // see if it's carried/borrowed
csel $h0,$h0,$d0,eq
csel $h1,$h1,$d1,eq
#ifdef __ARMEB__
ror $t0,$t0,#32 // flip nonce words
ror $t1,$t1,#32
#endif
adds $h0,$h0,$t0 // accumulate nonce
adc $h1,$h1,$t1
#ifdef __ARMEB__
rev $h0,$h0 // flip output bytes
rev $h1,$h1
#endif
stp $h0,$h1,[$mac] // write result
ret
.size poly1305_emit,.-poly1305_emit
___
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
my ($T0,$T1,$MASK) = map("v$_",(29..31));
my ($in2,$zeros)=("x16","x17");
my $is_base2_26 = $zeros; # borrow
$code.=<<___;
.type poly1305_mult,%function
.align 5
poly1305_mult:
mul $d0,$h0,$r0 // h0*r0
umulh $d1,$h0,$r0
mul $t0,$h1,$s1 // h1*5*r1
umulh $t1,$h1,$s1
adds $d0,$d0,$t0
mul $t0,$h0,$r1 // h0*r1
adc $d1,$d1,$t1
umulh $d2,$h0,$r1
adds $d1,$d1,$t0
mul $t0,$h1,$r0 // h1*r0
adc $d2,$d2,xzr
umulh $t1,$h1,$r0
adds $d1,$d1,$t0
mul $t0,$h2,$s1 // h2*5*r1
adc $d2,$d2,$t1
mul $t1,$h2,$r0 // h2*r0
adds $d1,$d1,$t0
adc $d2,$d2,$t1
and $t0,$d2,#-4 // final reduction
and $h2,$d2,#3
add $t0,$t0,$d2,lsr#2
adds $h0,$d0,$t0
adc $h1,$d1,xzr
ret
.size poly1305_mult,.-poly1305_mult
.type poly1305_splat,%function
.align 5
poly1305_splat:
and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
ubfx x13,$h0,#26,#26
extr x14,$h1,$h0,#52
and x14,x14,#0x03ffffff
ubfx x15,$h1,#14,#26
extr x16,$h2,$h1,#40
str w12,[$ctx,#16*0] // r0
add w12,w13,w13,lsl#2 // r1*5
str w13,[$ctx,#16*1] // r1
add w13,w14,w14,lsl#2 // r2*5
str w12,[$ctx,#16*2] // s1
str w14,[$ctx,#16*3] // r2
add w14,w15,w15,lsl#2 // r3*5
str w13,[$ctx,#16*4] // s2
str w15,[$ctx,#16*5] // r3
add w15,w16,w16,lsl#2 // r4*5
str w14,[$ctx,#16*6] // s3
str w16,[$ctx,#16*7] // r4
str w15,[$ctx,#16*8] // s4
ret
.size poly1305_splat,.-poly1305_splat
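// The limbs of each power r^n (with the 5*r values interleaved) are
// stored as 32-bit words strided 16 bytes apart; successive powers
// are written 4 bytes lower, so each 16-byte slot ends up holding one
// coefficient for r^4,r^3,r^2,r^1 and loads directly as a .4s vector.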
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
ldr $is_base2_26,[$ctx,#24]
cmp $len,#128
b.hs .Lblocks_neon
cbz $is_base2_26,poly1305_blocks
.Lblocks_neon:
stp x29,x30,[sp,#-80]!
add x29,sp,#0
ands $len,$len,#-16
b.eq .Lno_data_neon
cbz $is_base2_26,.Lbase2_64_neon
ldp w10,w11,[$ctx] // load hash value base 2^26
ldp w12,w13,[$ctx,#8]
ldr w14,[$ctx,#16]
tst $len,#31
b.eq .Leven_neon
ldp $r0,$r1,[$ctx,#32] // load key value
add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
lsr $h1,x12,#12
adds $h0,$h0,x12,lsl#52
add $h1,$h1,x13,lsl#14
adc $h1,$h1,xzr
lsr $h2,x14,#24
adds $h1,$h1,x14,lsl#40
adc $d2,$h2,xzr // can be partially reduced...
ldp $d0,$d1,[$inp],#16 // load input
sub $len,$len,#16
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
and $t0,$d2,#-4 // ... so reduce
and $h2,$d2,#3
add $t0,$t0,$d2,lsr#2
adds $h0,$h0,$t0
adc $h1,$h1,xzr
#ifdef __ARMEB__
rev $d0,$d0
rev $d1,$d1
#endif
adds $h0,$h0,$d0 // accumulate input
adcs $h1,$h1,$d1
adc $h2,$h2,$padbit
bl poly1305_mult
ldr x30,[sp,#8]
cbz $padbit,.Lstore_base2_64_neon
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,$h0,#26,#26
extr x12,$h1,$h0,#52
and x12,x12,#0x03ffffff
ubfx x13,$h1,#14,#26
extr x14,$h2,$h1,#40
cbnz $len,.Leven_neon
stp w10,w11,[$ctx] // store hash value base 2^26
stp w12,w13,[$ctx,#8]
str w14,[$ctx,#16]
b .Lno_data_neon
.align 4
.Lstore_base2_64_neon:
stp $h0,$h1,[$ctx] // store hash value base 2^64
stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed
b .Lno_data_neon
.align 4
.Lbase2_64_neon:
ldp $r0,$r1,[$ctx,#32] // load key value
ldp $h0,$h1,[$ctx] // load hash value base 2^64
ldr $h2,[$ctx,#16]
tst $len,#31
b.eq .Linit_neon
ldp $d0,$d1,[$inp],#16 // load input
sub $len,$len,#16
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
#ifdef __ARMEB__
rev $d0,$d0
rev $d1,$d1
#endif
adds $h0,$h0,$d0 // accumulate input
adcs $h1,$h1,$d1
adc $h2,$h2,$padbit
bl poly1305_mult
.Linit_neon:
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,$h0,#26,#26
extr x12,$h1,$h0,#52
and x12,x12,#0x03ffffff
ubfx x13,$h1,#14,#26
extr x14,$h2,$h1,#40
stp d8,d9,[sp,#16] // meet ABI requirements
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]
fmov ${H0},x10
fmov ${H1},x11
fmov ${H2},x12
fmov ${H3},x13
fmov ${H4},x14
////////////////////////////////// initialize r^n table
mov $h0,$r0 // r^1
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
mov $h1,$r1
mov $h2,xzr
add $ctx,$ctx,#48+12
bl poly1305_splat
bl poly1305_mult // r^2
sub $ctx,$ctx,#4
bl poly1305_splat
bl poly1305_mult // r^3
sub $ctx,$ctx,#4
bl poly1305_splat
bl poly1305_mult // r^4
sub $ctx,$ctx,#4
bl poly1305_splat
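// The table now holds r^1..r^4 in base 2^26: each poly1305_mult call
// multiplies the running power by r in base 2^64, and poly1305_splat
// re-splits and stores it, with $ctx stepped back 4 bytes per power
// to interleave the powers lane-wise (see poly1305_splat above).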
ldr x30,[sp,#8]
add $in2,$inp,#32
adr $zeros,.Lzeros
subs $len,$len,#64
csel $in2,$zeros,$in2,lo
mov x4,#1
str x4,[$ctx,#-24] // set is_base2_26
sub $ctx,$ctx,#48 // restore original $ctx
b .Ldo_neon
.align 4
.Leven_neon:
add $in2,$inp,#32
adr $zeros,.Lzeros
subs $len,$len,#64
csel $in2,$zeros,$in2,lo
stp d8,d9,[sp,#16] // meet ABI requirements
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]
fmov ${H0},x10
fmov ${H1},x11
fmov ${H2},x12
fmov ${H3},x13
fmov ${H4},x14
.Ldo_neon:
ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
ldp x9,x13,[$in2],#48
lsl $padbit,$padbit,#24
add x15,$ctx,#48
#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov $IN23_0,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,$padbit,x12,lsr#40
add x13,$padbit,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov $IN23_1,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
fmov $IN23_2,x8
fmov $IN23_3,x10
fmov $IN23_4,x12
ldp x8,x12,[$inp],#16 // inp[0:1]
ldp x9,x13,[$inp],#48
ld1 {$R0,$R1,$S1,$R2},[x15],#64
ld1 {$S2,$R3,$S3,$R4},[x15],#64
ld1 {$S4},[x15]
#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov $IN01_0,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,$padbit,x12,lsr#40
add x13,$padbit,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov $IN01_1,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
fmov $IN01_2,x8
fmov $IN01_3,x10
fmov $IN01_4,x12
b.ls .Lskip_loop
.align 4
.Loop_neon:
////////////////////////////////////////////////////////////////
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
// \___________________/
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
// \___________________/ \____________________/
//
// Note that we start with inp[2:3]*r^2. This is because it
// doesn't depend on the reduction in the previous iteration.
////////////////////////////////////////////////////////////////
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
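// Lane [2] of each coefficient vector holds r^2 and lane [0] holds
// r^4 (see how poly1305_splat lays out the table), so inp[2:3] is
// multiplied by r^2 here while hash+inp[0:1] picks up r^4 below.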
subs $len,$len,#64
umull $ACC4,$IN23_0,${R4}[2]
csel $in2,$zeros,$in2,lo
umull $ACC3,$IN23_0,${R3}[2]
umull $ACC2,$IN23_0,${R2}[2]
ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
umull $ACC1,$IN23_0,${R1}[2]
ldp x9,x13,[$in2],#48
umull $ACC0,$IN23_0,${R0}[2]
#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
umlal $ACC4,$IN23_1,${R3}[2]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal $ACC3,$IN23_1,${R2}[2]
and x5,x9,#0x03ffffff
umlal $ACC2,$IN23_1,${R1}[2]
ubfx x6,x8,#26,#26
umlal $ACC1,$IN23_1,${R0}[2]
ubfx x7,x9,#26,#26
umlal $ACC0,$IN23_1,${S4}[2]
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal $ACC4,$IN23_2,${R2}[2]
extr x8,x12,x8,#52
umlal $ACC3,$IN23_2,${R1}[2]
extr x9,x13,x9,#52
umlal $ACC2,$IN23_2,${R0}[2]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal $ACC1,$IN23_2,${S4}[2]
fmov $IN23_0,x4
umlal $ACC0,$IN23_2,${S3}[2]
and x8,x8,#0x03ffffff
umlal $ACC4,$IN23_3,${R1}[2]
and x9,x9,#0x03ffffff
umlal $ACC3,$IN23_3,${R0}[2]
ubfx x10,x12,#14,#26
umlal $ACC2,$IN23_3,${S4}[2]
ubfx x11,x13,#14,#26
umlal $ACC1,$IN23_3,${S3}[2]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal $ACC0,$IN23_3,${S2}[2]
fmov $IN23_1,x6
add $IN01_2,$IN01_2,$H2
add x12,$padbit,x12,lsr#40
umlal $ACC4,$IN23_4,${R0}[2]
add x13,$padbit,x13,lsr#40
umlal $ACC3,$IN23_4,${S4}[2]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal $ACC2,$IN23_4,${S3}[2]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal $ACC1,$IN23_4,${S2}[2]
fmov $IN23_2,x8
umlal $ACC0,$IN23_4,${S1}[2]
fmov $IN23_3,x10
////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4 and accumulate
add $IN01_0,$IN01_0,$H0
fmov $IN23_4,x12
umlal $ACC3,$IN01_2,${R1}[0]
ldp x8,x12,[$inp],#16 // inp[0:1]
umlal $ACC0,$IN01_2,${S3}[0]
ldp x9,x13,[$inp],#48
umlal $ACC4,$IN01_2,${R2}[0]
umlal $ACC1,$IN01_2,${S4}[0]
umlal $ACC2,$IN01_2,${R0}[0]
#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
add $IN01_1,$IN01_1,$H1
umlal $ACC3,$IN01_0,${R3}[0]
umlal $ACC4,$IN01_0,${R4}[0]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal $ACC2,$IN01_0,${R2}[0]
and x5,x9,#0x03ffffff
umlal $ACC0,$IN01_0,${R0}[0]
ubfx x6,x8,#26,#26
umlal $ACC1,$IN01_0,${R1}[0]
ubfx x7,x9,#26,#26
add $IN01_3,$IN01_3,$H3
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal $ACC3,$IN01_1,${R2}[0]
extr x8,x12,x8,#52
umlal $ACC4,$IN01_1,${R3}[0]
extr x9,x13,x9,#52
umlal $ACC0,$IN01_1,${S4}[0]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal $ACC2,$IN01_1,${R1}[0]
fmov $IN01_0,x4
umlal $ACC1,$IN01_1,${R0}[0]
and x8,x8,#0x03ffffff
add $IN01_4,$IN01_4,$H4
and x9,x9,#0x03ffffff
umlal $ACC3,$IN01_3,${R0}[0]
ubfx x10,x12,#14,#26
umlal $ACC0,$IN01_3,${S2}[0]
ubfx x11,x13,#14,#26
umlal $ACC4,$IN01_3,${R1}[0]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal $ACC1,$IN01_3,${S3}[0]
fmov $IN01_1,x6
umlal $ACC2,$IN01_3,${S4}[0]
add x12,$padbit,x12,lsr#40
umlal $ACC3,$IN01_4,${S4}[0]
add x13,$padbit,x13,lsr#40
umlal $ACC0,$IN01_4,${S1}[0]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal $ACC4,$IN01_4,${R0}[0]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal $ACC1,$IN01_4,${S2}[0]
fmov $IN01_2,x8
umlal $ACC2,$IN01_4,${S3}[0]
fmov $IN01_3,x10
/////////////////////////////////////////////////////////////////
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
// and P. Schwabe
ushr $T0.2d,$ACC3,#26
fmov $IN01_4,x12
xtn $H3,$ACC3
ushr $T1.2d,$ACC0,#26
xtn $H0,$ACC0
add $ACC4,$ACC4,$T0.2d // h3 -> h4
bic $H3,#0xfc,lsl#24 // &=0x03ffffff
add $ACC1,$ACC1,$T1.2d // h0 -> h1
bic $H0,#0xfc,lsl#24
shrn $T0.2s,$ACC4,#26
xtn $H4,$ACC4
ushr $T1.2d,$ACC1,#26
xtn $H1,$ACC1
add $ACC2,$ACC2,$T1.2d // h1 -> h2
bic $H4,#0xfc,lsl#24
bic $H1,#0xfc,lsl#24
add $H0,$H0,$T0.2s
shl $T0.2s,$T0.2s,#2
shrn $T1.2s,$ACC2,#26
xtn $H2,$ACC2
add $H0,$H0,$T0.2s // h4 -> h0
add $H3,$H3,$T1.2s // h2 -> h3
bic $H2,#0xfc,lsl#24
ushr $T0.2s,$H0,#26
bic $H0,#0xfc,lsl#24
ushr $T1.2s,$H3,#26
bic $H3,#0xfc,lsl#24
add $H1,$H1,$T0.2s // h0 -> h1
add $H4,$H4,$T1.2s // h3 -> h4
b.hi .Loop_neon
.Lskip_loop:
dup $IN23_2,${IN23_2}[0]
movi $MASK.2d,#-1
add $IN01_2,$IN01_2,$H2
ushr $MASK.2d,$MASK.2d,#38
////////////////////////////////////////////////////////////////
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
adds $len,$len,#32
b.ne .Long_tail
dup $IN23_2,${IN01_2}[0]
add $IN23_0,$IN01_0,$H0
add $IN23_3,$IN01_3,$H3
add $IN23_1,$IN01_1,$H1
add $IN23_4,$IN01_4,$H4
.Long_tail:
dup $IN23_0,${IN23_0}[0]
umull2 $ACC0,$IN23_2,${S3}
umull2 $ACC3,$IN23_2,${R1}
umull2 $ACC4,$IN23_2,${R2}
umull2 $ACC2,$IN23_2,${R0}
umull2 $ACC1,$IN23_2,${S4}
dup $IN23_1,${IN23_1}[0]
umlal2 $ACC0,$IN23_0,${R0}
umlal2 $ACC2,$IN23_0,${R2}
umlal2 $ACC3,$IN23_0,${R3}
umlal2 $ACC4,$IN23_0,${R4}
umlal2 $ACC1,$IN23_0,${R1}
dup $IN23_3,${IN23_3}[0]
umlal2 $ACC0,$IN23_1,${S4}
umlal2 $ACC3,$IN23_1,${R2}
umlal2 $ACC2,$IN23_1,${R1}
umlal2 $ACC4,$IN23_1,${R3}
umlal2 $ACC1,$IN23_1,${R0}
dup $IN23_4,${IN23_4}[0]
umlal2 $ACC3,$IN23_3,${R0}
umlal2 $ACC4,$IN23_3,${R1}
umlal2 $ACC0,$IN23_3,${S2}
umlal2 $ACC1,$IN23_3,${S3}
umlal2 $ACC2,$IN23_3,${S4}
umlal2 $ACC3,$IN23_4,${S4}
umlal2 $ACC0,$IN23_4,${S1}
umlal2 $ACC4,$IN23_4,${R0}
umlal2 $ACC1,$IN23_4,${S2}
umlal2 $ACC2,$IN23_4,${S3}
b.eq .Lshort_tail
////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4:r^3 and accumulate
add $IN01_0,$IN01_0,$H0
umlal $ACC3,$IN01_2,${R1}
umlal $ACC0,$IN01_2,${S3}
umlal $ACC4,$IN01_2,${R2}
umlal $ACC1,$IN01_2,${S4}
umlal $ACC2,$IN01_2,${R0}
add $IN01_1,$IN01_1,$H1
umlal $ACC3,$IN01_0,${R3}
umlal $ACC0,$IN01_0,${R0}
umlal $ACC4,$IN01_0,${R4}
umlal $ACC1,$IN01_0,${R1}
umlal $ACC2,$IN01_0,${R2}
add $IN01_3,$IN01_3,$H3
umlal $ACC3,$IN01_1,${R2}
umlal $ACC0,$IN01_1,${S4}
umlal $ACC4,$IN01_1,${R3}
umlal $ACC1,$IN01_1,${R0}
umlal $ACC2,$IN01_1,${R1}
add $IN01_4,$IN01_4,$H4
umlal $ACC3,$IN01_3,${R0}
umlal $ACC0,$IN01_3,${S2}
umlal $ACC4,$IN01_3,${R1}
umlal $ACC1,$IN01_3,${S3}
umlal $ACC2,$IN01_3,${S4}
umlal $ACC3,$IN01_4,${S4}
umlal $ACC0,$IN01_4,${S1}
umlal $ACC4,$IN01_4,${R0}
umlal $ACC1,$IN01_4,${S2}
umlal $ACC2,$IN01_4,${S3}
.Lshort_tail:
////////////////////////////////////////////////////////////////
// lazy reduction, but without narrowing
ushr $T0.2d,$ACC3,#26
and $ACC3,$ACC3,$MASK.2d
ushr $T1.2d,$ACC0,#26
and $ACC0,$ACC0,$MASK.2d
add $ACC4,$ACC4,$T0.2d // h3 -> h4
add $ACC1,$ACC1,$T1.2d // h0 -> h1
ushr $T0.2d,$ACC4,#26
and $ACC4,$ACC4,$MASK.2d
ushr $T1.2d,$ACC1,#26
and $ACC1,$ACC1,$MASK.2d
add $ACC2,$ACC2,$T1.2d // h1 -> h2
add $ACC0,$ACC0,$T0.2d
shl $T0.2d,$T0.2d,#2
ushr $T1.2d,$ACC2,#26
and $ACC2,$ACC2,$MASK.2d
add $ACC0,$ACC0,$T0.2d // h4 -> h0
add $ACC3,$ACC3,$T1.2d // h2 -> h3
ushr $T0.2d,$ACC0,#26
and $ACC0,$ACC0,$MASK.2d
ushr $T1.2d,$ACC3,#26
and $ACC3,$ACC3,$MASK.2d
add $ACC1,$ACC1,$T0.2d // h0 -> h1
add $ACC4,$ACC4,$T1.2d // h3 -> h4
////////////////////////////////////////////////////////////////
// horizontal add
addp $ACC2,$ACC2,$ACC2
ldp d8,d9,[sp,#16] // meet ABI requirements
addp $ACC0,$ACC0,$ACC0
ldp d10,d11,[sp,#32]
addp $ACC1,$ACC1,$ACC1
ldp d12,d13,[sp,#48]
addp $ACC3,$ACC3,$ACC3
ldp d14,d15,[sp,#64]
addp $ACC4,$ACC4,$ACC4
////////////////////////////////////////////////////////////////
// write the result, can be partially reduced
st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
st1 {$ACC4}[0],[$ctx]
.Lno_data_neon:
ldr x29,[sp],#80
ret
.size poly1305_blocks_neon,.-poly1305_blocks_neon
.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
ldr $is_base2_26,[$ctx,#24]
cbz $is_base2_26,poly1305_emit
ldp w10,w11,[$ctx] // load hash value base 2^26
ldp w12,w13,[$ctx,#8]
ldr w14,[$ctx,#16]
add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
lsr $h1,x12,#12
adds $h0,$h0,x12,lsl#52
add $h1,$h1,x13,lsl#14
adc $h1,$h1,xzr
lsr $h2,x14,#24
adds $h1,$h1,x14,lsl#40
adc $h2,$h2,xzr // can be partially reduced...
ldp $t0,$t1,[$nonce] // load nonce
and $d0,$h2,#-4 // ... so reduce
add $d0,$d0,$h2,lsr#2
and $h2,$h2,#3
adds $h0,$h0,$d0
adc $h1,$h1,xzr
adds $d0,$h0,#5 // compare to modulus
adcs $d1,$h1,xzr
adc $d2,$h2,xzr
tst $d2,#-4 // see if it's carried/borrowed
csel $h0,$h0,$d0,eq
csel $h1,$h1,$d1,eq
#ifdef __ARMEB__
ror $t0,$t0,#32 // flip nonce words
ror $t1,$t1,#32
#endif
adds $h0,$h0,$t0 // accumulate nonce
adc $h1,$h1,$t1
#ifdef __ARMEB__
rev $h0,$h0 // flip output bytes
rev $h1,$h1
#endif
stp $h0,$h1,[$mac] // write result
ret
.size poly1305_emit_neon,.-poly1305_emit_neon
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long OPENSSL_armcap_P-.
#else
.quad OPENSSL_armcap_P-.
#endif
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
foreach (split("\n",$code)) {
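# Post-process the generated code: rewrite the arrangement specifiers
# that were chosen above for readability (shrn narrowing, scalar fmov,
# dup/umull/umlal element sizes, st1/st4 lane stores) into the forms
# the assembler actually accepts.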
s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
s/\.[124]([sd])\[/.$1\[/;
print $_,"\n";
}
close STDOUT;