提交 6c1b0da1 编写于 作者: A Ard Biesheuvel 提交者: Herbert Xu

crypto: arm64/crct10dif - preparatory refactor for 8x8 PMULL version

Reorganize the CRC-T10DIF asm routine so we can easily instantiate an
alternative version based on 8x8 polynomial multiplication in a
subsequent patch.
Signed-off-by: NArd Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: NHerbert Xu <herbert@gondor.apana.org.au>
上级 598b7d41
......@@ -80,7 +80,46 @@
vzr .req v13
ENTRY(crc_t10dif_pmull)
.macro fold64, p, reg1, reg2
ldp q11, q12, [arg2], #0x20
__pmull_\p v8, \reg1, v10, 2
__pmull_\p \reg1, \reg1, v10
CPU_LE( rev64 v11.16b, v11.16b )
CPU_LE( rev64 v12.16b, v12.16b )
__pmull_\p v9, \reg2, v10, 2
__pmull_\p \reg2, \reg2, v10
CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
eor \reg1\().16b, \reg1\().16b, v8.16b
eor \reg2\().16b, \reg2\().16b, v9.16b
eor \reg1\().16b, \reg1\().16b, v11.16b
eor \reg2\().16b, \reg2\().16b, v12.16b
.endm
.macro fold16, p, reg, rk
__pmull_\p v8, \reg, v10
__pmull_\p \reg, \reg, v10, 2
.ifnb \rk
ldr_l q10, \rk, x8
.endif
eor v7.16b, v7.16b, v8.16b
eor v7.16b, v7.16b, \reg\().16b
.endm
.macro __pmull_p64, rd, rn, rm, n
.ifb \n
pmull \rd\().1q, \rn\().1d, \rm\().1d
.else
pmull2 \rd\().1q, \rn\().2d, \rm\().2d
.endif
.endm
.macro crc_t10dif_pmull, p
frame_push 3, 128
mov arg1_low32, w0
......@@ -96,7 +135,7 @@ ENTRY(crc_t10dif_pmull)
cmp arg3, #256
// for sizes less than 128, we can't fold 64B at a time...
b.lt _less_than_128
b.lt .L_less_than_128_\@
// load the initial crc value
// crc value does not need to be byte-reflected, but it needs
......@@ -147,41 +186,19 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// buffer. The _fold_64_B_loop will fold 64B at a time
// until we have 64+y Bytes of buffer
// fold 64B at a time. This section of the code folds 4 vector
// registers in parallel
_fold_64_B_loop:
.macro fold64, reg1, reg2
ldp q11, q12, [arg2], #0x20
pmull2 v8.1q, \reg1\().2d, v10.2d
pmull \reg1\().1q, \reg1\().1d, v10.1d
CPU_LE( rev64 v11.16b, v11.16b )
CPU_LE( rev64 v12.16b, v12.16b )
pmull2 v9.1q, \reg2\().2d, v10.2d
pmull \reg2\().1q, \reg2\().1d, v10.1d
CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
eor \reg1\().16b, \reg1\().16b, v8.16b
eor \reg2\().16b, \reg2\().16b, v9.16b
eor \reg1\().16b, \reg1\().16b, v11.16b
eor \reg2\().16b, \reg2\().16b, v12.16b
.endm
.L_fold_64_B_loop_\@:
fold64 v0, v1
fold64 v2, v3
fold64 v4, v5
fold64 v6, v7
fold64 \p, v0, v1
fold64 \p, v2, v3
fold64 \p, v4, v5
fold64 \p, v6, v7
subs arg3, arg3, #128
// check if there is another 64B in the buffer to be able to fold
b.lt _fold_64_B_end
b.lt .L_fold_64_B_end_\@
if_will_cond_yield_neon
stp q0, q1, [sp, #.Lframe_local_offset]
......@@ -197,9 +214,9 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
movi vzr.16b, #0 // init zero register
endif_yield_neon
b _fold_64_B_loop
b .L_fold_64_B_loop_\@
_fold_64_B_end:
.L_fold_64_B_end_\@:
// at this point, the buffer pointer is pointing at the last y Bytes
// of the buffer the 64B of folded data is in 4 of the vector
// registers: v0, v1, v2, v3
......@@ -209,37 +226,27 @@ _fold_64_B_end:
ldr_l q10, rk9, x8
.macro fold16, reg, rk
pmull v8.1q, \reg\().1d, v10.1d
pmull2 \reg\().1q, \reg\().2d, v10.2d
.ifnb \rk
ldr_l q10, \rk, x8
.endif
eor v7.16b, v7.16b, v8.16b
eor v7.16b, v7.16b, \reg\().16b
.endm
fold16 v0, rk11
fold16 v1, rk13
fold16 v2, rk15
fold16 v3, rk17
fold16 v4, rk19
fold16 v5, rk1
fold16 v6
fold16 \p, v0, rk11
fold16 \p, v1, rk13
fold16 \p, v2, rk15
fold16 \p, v3, rk17
fold16 \p, v4, rk19
fold16 \p, v5, rk1
fold16 \p, v6
// instead of 64, we add 48 to the loop counter to save 1 instruction
// from the loop instead of a cmp instruction, we use the negative
// flag with the jl instruction
adds arg3, arg3, #(128-16)
b.lt _final_reduction_for_128
b.lt .L_final_reduction_for_128_\@
// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
// and the rest is in memory. We can fold 16 bytes at a time if y>=16
// continue folding 16B at a time
_16B_reduction_loop:
pmull v8.1q, v7.1d, v10.1d
pmull2 v7.1q, v7.2d, v10.2d
.L_16B_reduction_loop_\@:
__pmull_\p v8, v7, v10
__pmull_\p v7, v7, v10, 2
eor v7.16b, v7.16b, v8.16b
ldr q0, [arg2], #16
......@@ -251,22 +258,22 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
// instead of a cmp instruction, we utilize the flags with the
// jge instruction equivalent of: cmp arg3, 16-16
// check if there is any more 16B in the buffer to be able to fold
b.ge _16B_reduction_loop
b.ge .L_16B_reduction_loop_\@
// now we have 16+z bytes left to reduce, where 0<= z < 16.
// first, we reduce the data in the xmm7 register
_final_reduction_for_128:
.L_final_reduction_for_128_\@:
// check if any more data to fold. If not, compute the CRC of
// the final 128 bits
adds arg3, arg3, #16
b.eq _128_done
b.eq .L_128_done_\@
// here we are getting data that is less than 16 bytes.
// since we know that there was data before the pointer, we can
// offset the input pointer before the actual point, to receive
// exactly 16 bytes. after that the registers need to be adjusted.
_get_last_two_regs:
.L_get_last_two_regs_\@:
add arg2, arg2, arg3
ldr q1, [arg2, #-16]
CPU_LE( rev64 v1.16b, v1.16b )
......@@ -291,47 +298,46 @@ CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
bsl v0.16b, v2.16b, v1.16b
// fold 16 Bytes
pmull v8.1q, v7.1d, v10.1d
pmull2 v7.1q, v7.2d, v10.2d
__pmull_\p v8, v7, v10
__pmull_\p v7, v7, v10, 2
eor v7.16b, v7.16b, v8.16b
eor v7.16b, v7.16b, v0.16b
_128_done:
.L_128_done_\@:
// compute crc of a 128-bit value
ldr_l q10, rk5, x8 // rk5 and rk6 in xmm10
// 64b fold
ext v0.16b, vzr.16b, v7.16b, #8
mov v7.d[0], v7.d[1]
pmull v7.1q, v7.1d, v10.1d
__pmull_\p v7, v7, v10
eor v7.16b, v7.16b, v0.16b
// 32b fold
ext v0.16b, v7.16b, vzr.16b, #4
mov v7.s[3], vzr.s[0]
pmull2 v0.1q, v0.2d, v10.2d
__pmull_\p v0, v0, v10, 2
eor v7.16b, v7.16b, v0.16b
// barrett reduction
_barrett:
ldr_l q10, rk7, x8
mov v0.d[0], v7.d[1]
pmull v0.1q, v0.1d, v10.1d
__pmull_\p v0, v0, v10
ext v0.16b, vzr.16b, v0.16b, #12
pmull2 v0.1q, v0.2d, v10.2d
__pmull_\p v0, v0, v10, 2
ext v0.16b, vzr.16b, v0.16b, #12
eor v7.16b, v7.16b, v0.16b
mov w0, v7.s[1]
_cleanup:
.L_cleanup_\@:
// scale the result back to 16 bits
lsr x0, x0, #16
frame_pop
ret
_less_than_128:
cbz arg3, _cleanup
.L_less_than_128_\@:
cbz arg3, .L_cleanup_\@
movi v0.16b, #0
mov v0.s[3], arg1_low32 // get the initial crc value
......@@ -342,20 +348,20 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
eor v7.16b, v7.16b, v0.16b // xor the initial crc value
cmp arg3, #16
b.eq _128_done // exactly 16 left
b.lt _less_than_16_left
b.eq .L_128_done_\@ // exactly 16 left
b.lt .L_less_than_16_left_\@
ldr_l q10, rk1, x8 // rk1 and rk2 in xmm10
// update the counter. subtract 32 instead of 16 to save one
// instruction from the loop
subs arg3, arg3, #32
b.ge _16B_reduction_loop
b.ge .L_16B_reduction_loop_\@
add arg3, arg3, #16
b _get_last_two_regs
b .L_get_last_two_regs_\@
_less_than_16_left:
.L_less_than_16_left_\@:
// shl r9, 4
adr_l x0, tbl_shf_table + 16
sub x0, x0, arg3
......@@ -363,8 +369,12 @@ _less_than_16_left:
movi v9.16b, #0x80
eor v0.16b, v0.16b, v9.16b
tbl v7.16b, {v7.16b}, v0.16b
b _128_done
ENDPROC(crc_t10dif_pmull)
b .L_128_done_\@
.endm
ENTRY(crc_t10dif_pmull_p64)
crc_t10dif_pmull p64
ENDPROC(crc_t10dif_pmull_p64)
// precomputed constants
// these constants are precomputed from the poly:
......
......@@ -22,7 +22,9 @@
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u64 len);
asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 buf[], u64 len);
static u16 (*crc_t10dif_pmull)(u16 init_crc, const u8 buf[], u64 len);
static int crct10dif_init(struct shash_desc *desc)
{
......@@ -85,6 +87,8 @@ static struct shash_alg crc_t10dif_alg = {
static int __init crc_t10dif_mod_init(void)
{
crc_t10dif_pmull = crc_t10dif_pmull_p64;
return crypto_register_shash(&crc_t10dif_alg);
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册