提交 f596bbe4 编写于 作者: D Deepankar Bhattacharjee 提交者: Pauli

ChaCha20 performance optimizations for ppc64le with 8x lanes.

Performance increase around 50%.
Co-authored-by: Madhusudhanan Duraisamy <madurais@in.ibm.com>
Co-authored-by: Nilamjyoti Goswami <nilamgoswami@in.ibm.com>
Co-authored-by: Siva Sundar Anbareeswaran <srisivasundar@in.ibm.com>
Reviewed-by: Danny Tsen <dtsen@us.ibm.com>
Tested-by: Danny Tsen <dtsen@us.ibm.com>
Signed-off-by: Danny <dtsen@us.ibm.com>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/16637)
上级 7b3041eb
此差异已折叠。
......@@ -12,7 +12,7 @@ IF[{- !$disabled{asm} -}]
$CHACHAASM_armv4=chacha-armv4.S
$CHACHAASM_aarch64=chacha-armv8.S
$CHACHAASM_ppc32=chacha_ppc.c chacha-ppc.s
$CHACHAASM_ppc32=chacha_ppc.c chacha-ppc.s chachap10-ppc.s
$CHACHAASM_ppc64=$CHACHAASM_ppc32
$CHACHAASM_c64xplus=chacha-c64xplus.s
......@@ -29,6 +29,7 @@ SOURCE[../../libcrypto]=$CHACHAASM
GENERATE[chacha-x86.s]=asm/chacha-x86.pl
GENERATE[chacha-x86_64.s]=asm/chacha-x86_64.pl
GENERATE[chacha-ppc.s]=asm/chacha-ppc.pl
GENERATE[chachap10-ppc.s]=asm/chachap10-ppc.pl
GENERATE[chacha-armv4.S]=asm/chacha-armv4.pl
INCLUDE[chacha-armv4.o]=..
GENERATE[chacha-armv8.S]=asm/chacha-armv8.pl
......
......@@ -23,13 +23,18 @@ void ChaCha20_ctr32_vmx(unsigned char *out, const unsigned char *inp,
void ChaCha20_ctr32_vsx(unsigned char *out, const unsigned char *inp,
size_t len, const unsigned int key[8],
const unsigned int counter[4]);
void ChaCha20_ctr32_vsx_p10(unsigned char *out, const unsigned char *inp,
size_t len, const unsigned int key[8],
const unsigned int counter[4]);
/*
 * ChaCha20_ctr32 - run-time dispatcher for the PowerPC ChaCha20 back ends.
 *
 * Forwards all five arguments unchanged to exactly one implementation,
 * chosen from the capability bits in OPENSSL_ppccap_P, in this order:
 *
 *   PPC_BRD31      -> ChaCha20_ctr32_vsx_p10
 *   PPC_CRYPTO207  -> ChaCha20_ctr32_vsx
 *   PPC_ALTIVEC    -> ChaCha20_ctr32_vmx
 *   otherwise      -> ChaCha20_ctr32_int
 *
 * out     - destination buffer
 * inp     - source buffer
 * len     - byte count passed through to the selected back end
 * key     - 8-word key
 * counter - 4-word counter block
 *
 * The dispatch must happen exactly once per call.  The previous text of
 * this function carried two complete dispatch statements back to back (the
 * pre-POWER10 chain followed by the POWER10-aware chain), which would have
 * processed the buffer twice; only the POWER10-aware chain is kept.
 */
void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
                    size_t len, const unsigned int key[8],
                    const unsigned int counter[4])
{
    if (OPENSSL_ppccap_P & PPC_BRD31)
        ChaCha20_ctr32_vsx_p10(out, inp, len, key, counter);
    else if (OPENSSL_ppccap_P & PPC_CRYPTO207)
        ChaCha20_ctr32_vsx(out, inp, len, key, counter);
    else if (OPENSSL_ppccap_P & PPC_ALTIVEC)
        ChaCha20_ctr32_vmx(out, inp, len, key, counter);
    else
        ChaCha20_ctr32_int(out, inp, len, key, counter);
}
......@@ -293,6 +293,14 @@ my $vpermdi = sub { # xxpermdi
$dm = oct($dm) if ($dm =~ /^0/);
" .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($dm<<8)|(10<<3)|7;
};
# xxlor, emitted as a raw ".long" word so that assemblers without the
# mnemonic can still build the output.
#
# Arguments: ($f, $vrt, $vra, $vrb) -- the first is unused here; the other
# three are register numbers placed at bit offsets 21, 16 and 11.
# The constant part is primary opcode 60, extended opcode 146 and low bits
# 0b110.  NOTE(review): in the Power ISA XX3-form those low bits are the
# AX/BX/TX register-extension bits -- confirm against the ISA.
my $vxxlor = sub {
    my ($flavour, $target, $src_a, $src_b) = @_;

    my $fixed_bits = (60 << 26) | (146 << 3) | 6;
    my $reg_bits   = ($target << 21) | ($src_a << 16) | ($src_b << 11);

    return sprintf " .long 0x%X", $fixed_bits | $reg_bits;
};
# Companion to $vxxlor, emitted as a raw ".long" word.
#
# Despite the variable name, the word carries the same primary opcode (60)
# and extended opcode (146) as $vxxlor; only the low three bits differ
# (0b001 here instead of 0b110).  NOTE(review): in the Power ISA XX3-form
# that would set TX rather than AX/BX, i.e. still an xxlor but with the
# register-extension bit on the target -- confirm against the ISA.
#
# Arguments: ($f, $vrt, $vra, $vrb) -- the first is unused here; the other
# three are register numbers placed at bit offsets 21, 16 and 11.
my $vxxlorc = sub {
    my ($flavour, $target, $src_a, $src_b) = @_;

    my $fixed_bits = (60 << 26) | (146 << 3) | 1;
    my $reg_bits   = ($target << 21) | ($src_a << 16) | ($src_b << 11);

    return sprintf " .long 0x%X", $fixed_bits | $reg_bits;
};
# PowerISA 2.07 stuff
sub vcrypto_op {
......@@ -377,6 +385,15 @@ my $addex = sub {
};
# vmsumudm: hand the caller's operands to vfour_vsr with 35 appended as the
# final argument (presumably the opcode selector -- confirm in vfour_vsr).
my $vmsumudm = sub {
    return vfour_vsr(@_, 35);
};
# PowerISA 3.1 stuff
# brd, emitted as a raw ".long" word so that assemblers without the
# mnemonic can still build the output.
#
# Arguments: ($f, $ra, $rs) -- the first is unused here.  $rs lands at bit
# offset 21 and $ra at bit offset 16; the constant part is primary opcode
# 31 with 187 shifted left by one (the extended-opcode field).
my $brd = sub {
    my ($flavour, $dest, $source) = @_;

    my $fixed_bits = (31 << 26) | (187 << 1);
    my $reg_bits   = ($source << 21) | ($dest << 16);

    return sprintf " .long 0x%X", $fixed_bits | $reg_bits;
};
# vsrq: hand the caller's operands to vcrypto_op with 517 appended as the
# final argument (presumably the extended opcode -- confirm in vcrypto_op).
my $vsrq = sub {
    return vcrypto_op(@_, 517);
};
while($line=<>) {
$line =~ s|[#!;].*$||; # get rid of asm-style comments...
......
......@@ -45,6 +45,7 @@ void OPENSSL_ppc64_probe(void);
void OPENSSL_altivec_probe(void);
void OPENSSL_crypto207_probe(void);
void OPENSSL_madd300_probe(void);
void OPENSSL_brd31_probe(void);
long OPENSSL_rdtsc_mftb(void);
long OPENSSL_rdtsc_mfspr268(void);
......@@ -131,6 +132,7 @@ static unsigned long getauxval(unsigned long key)
#endif
#define HWCAP_VEC_CRYPTO (1U << 25)
#define HWCAP_ARCH_3_00 (1U << 23)
#define HWCAP_ARCH_3_1 (1U << 18)
# if defined(__GNUC__) && __GNUC__>=2
__attribute__ ((constructor))
......@@ -191,6 +193,9 @@ void OPENSSL_cpuid_setup(void)
if (__power_set(0xffffffffU<<17)) /* POWER9 and later */
OPENSSL_ppccap_P |= PPC_MADD300;
if (__power_set(0xffffffffU<<18)) /* POWER10 and later */
OPENSSL_ppccap_P |= PPC_BRD31;
return;
# endif
#endif
......@@ -246,6 +251,10 @@ void OPENSSL_cpuid_setup(void)
if (hwcap2 & HWCAP_ARCH_3_00) {
OPENSSL_ppccap_P |= PPC_MADD300;
}
if (hwcap2 & HWCAP_ARCH_3_1) {
OPENSSL_ppccap_P |= PPC_BRD31;
}
}
#endif
......
......@@ -81,6 +81,17 @@ $code=<<___;
.long 0
.byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_brd31_probe
.align 4
.OPENSSL_brd31_probe:
xor r0,r0,r0
brd r3,r0
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.size .OPENSSL_brd31_probe,.-.OPENSSL_brd31_probe
.globl .OPENSSL_wipe_cpu
.align 4
.OPENSSL_wipe_cpu:
......
......@@ -24,5 +24,6 @@ extern unsigned int OPENSSL_ppccap_P;
# define PPC_MADD300 (1<<4)
# define PPC_MFTB (1<<5)
# define PPC_MFSPR268 (1<<6)
# define PPC_BRD31 (1<<7)
#endif
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册