From 0edb109f97c1bbbd5961326f93b2ccf385b26674 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 3 Jul 2018 21:34:08 +0200 Subject: [PATCH] evp/e_chacha20_poly1305.c: further improve small-fragment TLS performance. Improvement coefficients vary with TLS fragment length and platform, on most Intel processors maximum improvement is ~50%, while on Ryzen - 80%. The "secret" is new dedicated ChaCha20_128 code path and vectorized xor helpers. Reviewed-by: Rich Salz (Merged from https://github.com/openssl/openssl/pull/6638) --- crypto/evp/e_chacha20_poly1305.c | 54 +++++++++++-- crypto/poly1305/asm/poly1305-x86_64.pl | 104 +++++++++++++++++++++++++ 2 files changed, 150 insertions(+), 8 deletions(-) diff --git a/crypto/evp/e_chacha20_poly1305.c b/crypto/evp/e_chacha20_poly1305.c index 47d5e50d45..6a9bccfe33 100644 --- a/crypto/evp/e_chacha20_poly1305.c +++ b/crypto/evp/e_chacha20_poly1305.c @@ -196,14 +196,23 @@ static int chacha20_poly1305_init_key(EVP_CIPHER_CTX *ctx, } # if !defined(OPENSSL_SMALL_FOOTPRINT) + +# if defined(POLY1305_ASM) && (defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_AMD64) || defined(_M_X64)) +# define XOR128_HELPERS +void *xor128_encrypt_n_pad(void *out, const void *inp, void *otp, size_t len); +void *xor128_decrypt_n_pad(void *out, const void *inp, void *otp, size_t len); +static const unsigned char zero[4 * CHACHA_BLK_SIZE] = { 0 }; +# else static const unsigned char zero[2 * CHACHA_BLK_SIZE] = { 0 }; +# endif static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t len) { EVP_CHACHA_AEAD_CTX *actx = aead_data(ctx); - size_t i, tail, tohash_len, plen = actx->tls_payload_length; - unsigned char *buf, *tohash, *ctr, storage[2 * CHACHA_BLK_SIZE + 32]; + size_t tail, tohash_len, buf_len, plen = actx->tls_payload_length; + unsigned char *buf, *tohash, *ctr, storage[sizeof(zero) + 32]; if (len != plen + POLY1305_BLOCK_SIZE) return -1; @@ -212,9 +221,11 @@ static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, ctr = buf + CHACHA_BLK_SIZE; tohash = buf + CHACHA_BLK_SIZE - POLY1305_BLOCK_SIZE; - if (plen <= CHACHA_BLK_SIZE) { +# ifdef XOR128_HELPERS + if (plen <= 3 * CHACHA_BLK_SIZE) { actx->key.counter[0] = 0; - ChaCha20_ctr32(buf, zero, 2 * CHACHA_BLK_SIZE, actx->key.key.d, + buf_len = (plen + 2 * CHACHA_BLK_SIZE - 1) & (0 - CHACHA_BLK_SIZE); + ChaCha20_ctr32(buf, zero, buf_len, actx->key.key.d, actx->key.counter); Poly1305_Init(POLY1305_ctx(actx), buf); actx->key.partial_len = 0; @@ -223,6 +234,31 @@ static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, actx->len.aad = EVP_AEAD_TLS1_AAD_LEN; actx->len.text = plen; + if (plen) { + if (ctx->encrypt) + ctr = xor128_encrypt_n_pad(out, in, ctr, plen); + else + ctr = xor128_decrypt_n_pad(out, in, ctr, plen); + + in += plen; + out += plen; + tohash_len = (size_t)(ctr - tohash); + } + } +# else + if (plen <= CHACHA_BLK_SIZE) { + size_t i; + + actx->key.counter[0] = 0; + ChaCha20_ctr32(buf, zero, (buf_len = 2 * CHACHA_BLK_SIZE), + actx->key.key.d, actx->key.counter); + Poly1305_Init(POLY1305_ctx(actx), buf); + actx->key.partial_len = 0; + memcpy(tohash, actx->tls_aad, POLY1305_BLOCK_SIZE); + tohash_len = POLY1305_BLOCK_SIZE; + actx->len.aad = EVP_AEAD_TLS1_AAD_LEN; + actx->len.text = plen; + if (ctx->encrypt) { for (i = 0; i < plen; i++) { out[i] = ctr[i] ^= in[i]; @@ -242,10 +278,12 @@ static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, memset(ctr + i, 0, tail); ctr += i + tail; tohash_len += i + tail; - } else { + } +# endif + else { actx->key.counter[0] = 0; - ChaCha20_ctr32(buf, zero, CHACHA_BLK_SIZE, actx->key.key.d, - actx->key.counter); + ChaCha20_ctr32(buf, zero, (buf_len = CHACHA_BLK_SIZE), + actx->key.key.d, actx->key.counter); Poly1305_Init(POLY1305_ctx(actx), buf); actx->key.counter[0] = 1; actx->key.partial_len = 0; @@ -300,7 +338,7 @@ static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, } Poly1305_Update(POLY1305_ctx(actx), tohash, tohash_len); - OPENSSL_cleanse(buf, 2 * CHACHA_BLK_SIZE); + OPENSSL_cleanse(buf, buf_len); Poly1305_Final(POLY1305_ctx(actx), ctx->encrypt ? actx->tag : tohash); diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl index 0d1c0de645..0b4c56e5af 100755 --- a/crypto/poly1305/asm/poly1305-x86_64.pl +++ b/crypto/poly1305/asm/poly1305-x86_64.pl @@ -3753,6 +3753,110 @@ poly1305_emit_base2_44: .size poly1305_emit_base2_44,.-poly1305_emit_base2_44 ___ } } } + +{ # chacha20-poly1305 helpers +my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order +$code.=<<___; +.globl xor128_encrypt_n_pad +.type xor128_encrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_encrypt_n_pad: + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_enc + nop +.Loop_enc_xmm: + movdqu ($inp,$otp),%xmm0 + pxor ($otp),%xmm0 + movdqu %xmm0,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_enc_xmm + + and \$15,%r10 # len % 16 + jz .Ldone_enc + +.Ltail_enc: + mov \$16,$len + sub %r10,$len + xor %eax,%eax +.Loop_enc_byte: + mov ($inp,$otp),%al + xor ($otp),%al + mov %al,($out,$otp) + mov %al,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_enc_byte + + xor %eax,%eax +.Loop_enc_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_enc_pad + +.Ldone_enc: + mov $otp,%rax + ret +.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad + +.globl xor128_decrypt_n_pad +.type xor128_decrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_decrypt_n_pad: + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_dec + nop +.Loop_dec_xmm: + movdqu ($inp,$otp),%xmm0 + movdqa ($otp),%xmm1 + pxor %xmm0,%xmm1 + movdqu %xmm1,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_dec_xmm + + pxor %xmm1,%xmm1 + and \$15,%r10 # len % 16 + jz .Ldone_dec + +.Ltail_dec: + mov \$16,$len + sub %r10,$len + xor %eax,%eax + xor %r11,%r11 +.Loop_dec_byte: + mov ($inp,$otp),%r11b + mov ($otp),%al + xor %r11b,%al + mov %al,($out,$otp) + mov %r11b,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_dec_byte + + xor %eax,%eax +.Loop_dec_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_dec_pad + +.Ldone_dec: + mov $otp,%rax + ret +.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad +___ +} $code.=<<___; .align 64 .Lconst: -- GitLab