Commit 20b900c4 authored by Mao Minkai, committed by Zheng Zengkai

sw64: optimize simd version of memcpy and memset

Sunway inclusion
category: performance
bugzilla: https://gitee.com/openeuler/kernel/issues/I56XPR

--------------------------------

Optimize the use of the memb instruction in memset.

Rewrite memcpy and use SIMD instructions to copy data when src and dest
are not co-aligned.

When the data size is larger than 2KB, use _nc store instructions to
improve performance.
Signed-off-by: Mao Minkai <maominkai@wxiat.com>
Signed-off-by: Gu Zitao <guzitao@wxiat.com>
Acked-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Parent 32852dde
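As a reading aid, the control flow of the rewritten memcpy can be sketched in C. This is a minimal sketch, not the kernel code: the scalar memcpy() calls stand in for the vldd/vstd SIMD loops, and the labels in the comments refer to the assembly below.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define NC_STORE_THRESHOLD 2048	/* bytes; above this the asm switches to vstd_nc */

void *sketch_memcpy(void *dest, const void *src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;
	int large = n > NC_STORE_THRESHOLD;

	/* $byte_loop_head: copy bytes until dest is 8-byte aligned */
	while (n > 0 && ((uintptr_t)d & 7)) {
		*d++ = *s++;
		n--;
	}
	/* $quad_loop_head: copy quads until dest is 32-byte aligned */
	while (n >= 8 && ((uintptr_t)d & 31)) {
		memcpy(d, s, 8);	/* stands in for ldl/stl (or the ldl_u merge) */
		d += 8;
		s += 8;
		n -= 8;
	}
	/*
	 * Main 64-byte loop.  The assembly uses vldd/vstd here; it picks
	 * $simd_loop_nc (vstd_nc plus one trailing memb) when "large", and
	 * $simd_u_loop when src is not also 32-byte aligned.
	 */
	while (n >= 64) {
		memcpy(d, s, 64);
		d += 64;
		s += 64;
		n -= 64;
	}
	(void)large;	/* the nc-vs-cacheable choice is not modeled here */

	/* $quad_end / $byte_loop_tail: copy what remains */
	while (n-- > 0)
		*d++ = *s++;
	return dest;
}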
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <asm/export.h>
-	.set noreorder
-	.set noat
-	.align 4
+
+#define NC_STORE_THRESHOLD	2048
+
+#define SAVE_SIMD_REGS \
+	ldi	$sp, -0x60($sp); \
+	addl	$sp, 0x1f, $23; \
+	bic	$23, 0x1f, $23; \
+	vstd	$f1, 0($23); \
+	vstd	$f2, 0x20($23)
+
+#define RESTORE_SIMD_REGS \
+	addl	$sp, 0x1f, $23; \
+	bic	$23, 0x1f, $23; \
+	vldd	$f1, 0($23); \
+	vldd	$f2, 0x20($23); \
+	ldi	$sp, 0x60($sp)
+
+#define SAVE_SIMD_U_REGS \
+	ldi	$sp, -0x120($sp); \
+	addl	$sp, 0x1f, $23; \
+	bic	$23, 0x1f, $23; \
+	vstd	$f1, 0($23); \
+	vstd	$f2, 0x20($23); \
+	vstd	$f4, 0x40($23); \
+	vstd	$f5, 0x60($23); \
+	vstd	$f10, 0x80($23); \
+	vstd	$f11, 0xa0($23); \
+	vstd	$f20, 0xc0($23); \
+	vstd	$f21, 0xe0($23)
+
+#define RESTORE_SIMD_U_REGS \
+	addl	$sp, 0x1f, $23; \
+	bic	$23, 0x1f, $23; \
+	vldd	$f1, 0($23); \
+	vldd	$f2, 0x20($23); \
+	vldd	$f4, 0x40($23); \
+	vldd	$f5, 0x60($23); \
+	vldd	$f10, 0x80($23); \
+	vldd	$f11, 0xa0($23); \
+	vldd	$f20, 0xc0($23); \
+	vldd	$f21, 0xe0($23); \
+	ldi	$sp, 0x120($sp)
+
+	.set noat
+	.align 4
 	.globl memcpy
 	.ent memcpy
 memcpy:
 	.frame $30, 0, $26, 0
 	.prologue 0
-	subl	$sp, 0xa0, $sp
-	ldi	$4, 0x40($sp)
-	stl	$4, 0($sp)
-	bic	$4, 0x1f, $4
-	vstd	$f4, 0($4)
-	vstd	$f5, 0x20($4)
-	mov	$16, $0
-	ble	$18, $nomoredata
-	xor	$16, $17, $1
-	and	$1, 7, $1
-	bne	$1, $misaligned
-	and	$16, 7, $1
-	beq	$1, $both_0mod8
-
-$head_align:
-	ldbu	$1, 0($17)
-	subl	$18, 1, $18
-	addl	$17, 1, $17
-	stb	$1, 0($16)
-	addl	$16, 1, $16
-	and	$16, 7, $1
-	ble	$18, $nomoredata
-	bne	$1, $head_align
-
-$both_0mod8:
-	cmple	$18, 127, $1
-	bne	$1, $no_unroll
-	and	$16, 63, $1
-	beq	$1, $do_unroll
-
-$single_head_quad:
-	ldl	$1, 0($17)
-	subl	$18, 8, $18
-	addl	$17, 8, $17
-	stl	$1, 0($16)
-	addl	$16, 8, $16
-	and	$16, 63, $1
-	bne	$1, $single_head_quad
-
-$do_unroll:
-	addl	$16, 64, $7
-	cmple	$18, 127, $1
-	bne	$1, $tail_quads
-#JJ
-	and	$17, 31, $1
-	bne	$1, $unroll_body
-
-$unroll_body_simd:
-	ldwe	$f31, 128*5($17)
-	vldd	$f4, 0($17)
-	vldd	$f5, 32($17)
-	vstd_nc	$f4, 0($16)
-	vstd_nc	$f5, 32($16)
-	addl	$16, 64, $16
-	subl	$18, 64, $18
-	addl	$17, 64, $17
-	cmple	$18, 63, $1
-	beq	$1, $unroll_body_simd
-	memb
-	br	$no_unroll
-#endJJ
-
-$unroll_body:
-	#wh64 ($7)
-	#e_fillcs 0($7)
-	ldl	$6, 0($17)
-	#e_fillcs 256($17)
-	ldl	$4, 8($17)
-	ldl	$5, 16($17)
-	addl	$7, 64, $7
-	ldl	$3, 24($17)
-	addl	$16, 64, $1
-	addl	$17, 32, $17
-	stl_nc	$6, 0($16)
-	stl_nc	$4, 8($16)
-	stl_nc	$5, 16($16)
-	subl	$18, 192, $2
-	stl_nc	$3, 24($16)
-	addl	$16, 32, $16
-	ldl	$6, 0($17)
-	ldwe	$f31, 4*128($17)
-	#e_fillcs 288($17)
-	ldl	$4, 8($17)
-	#cmovlt	$2, $1, $7
-	sellt	$2, $1, $7, $7
-	ldl	$5, 16($17)
-	ldl	$3, 24($17)
-	addl	$16, 32, $16
-	subl	$18, 64, $18
-	addl	$17, 32, $17
-	stl_nc	$6, -32($16)
-	stl_nc	$4, -24($16)
-	cmple	$18, 63, $1
-	stl_nc	$5, -16($16)
-	stl_nc	$3, -8($16)
-	beq	$1, $unroll_body
-	memb
-
-$tail_quads:
-$no_unroll:
-	.align 4
-	subl	$18, 8, $18
-	blt	$18, $less_than_8
-
-$move_a_quad:
-	ldl	$1, 0($17)
-	subl	$18, 8, $18
-	addl	$17, 8, $17
-	stl	$1, 0($16)
-	addl	$16, 8, $16
-	bge	$18, $move_a_quad
-
-$less_than_8:
-	.align 4
-	addl	$18, 8, $18
-	ble	$18, $nomoredata
-
-$tail_bytes:
-	subl	$18, 1, $18
-	ldbu	$1, 0($17)
-	addl	$17, 1, $17
-	stb	$1, 0($16)
-	addl	$16, 1, $16
-	bgt	$18, $tail_bytes
-	ldi	$4, 0x40($sp)
-	bic	$4, 0x1f, $4
-	vldd	$f4, 0($4)
-	vldd	$f5, 0x20($4)
-	ldl	$4, 0($sp)
-	addl	$sp, 0xa0, $sp
-	ret	$31, ($26), 1
-
-$misaligned:
-	mov	$0, $4
-	and	$0, 7, $1
-	beq	$1, $dest_0mod8
-
-$aligndest:
-	ble	$18, $nomoredata
-	ldbu	$1, 0($17)
-	subl	$18, 1, $18
-	addl	$17, 1, $17
-	stb	$1, 0($4)
-	addl	$4, 1, $4
-	and	$4, 7, $1
-	bne	$1, $aligndest
-
-$dest_0mod8:
-	subl	$18, 8, $18
-	blt	$18, $misalign_tail
-	ldl_u	$3, 0($17)
-
-$mis_quad:
-	ldl_u	$16, 8($17)
-	#extql	$3, $17, $3
-	fillde	256($17)
-	and	$17, 7, $1
-	sll	$1, 3, $1
-	srl	$3, $1, $3
-	#extqh	$16, $17, $1
-	subl	$1, 64, $1
-	negl	$1, $1
-	sll	$16, $1, $1
-	bis	$3, $1, $1
-	subl	$18, 8, $18
-	addl	$17, 8, $17
-	fillde	128($4)
-	stl	$1, 0($4)
-	mov	$16, $3
-	addl	$4, 8, $4
-	bge	$18, $mis_quad
-
-$misalign_tail:
-	addl	$18, 8, $18
-	ble	$18, $nomoredata
-
-$misalign_byte:
-	ldbu	$1, 0($17)
-	subl	$18, 1, $18
-	addl	$17, 1, $17
-	stb	$1, 0($4)
-	addl	$4, 1, $4
-	bgt	$18, $misalign_byte
-
-$nomoredata:
-	ldi	$4, 0x40($sp)
-	bic	$4, 0x1f, $4
-	vldd	$f4, 0($4)
-	vldd	$f5, 0x20($4)
-	ldl	$4, 0($sp)
-	addl	$sp, 0xa0, $sp
-	ret	$31, ($26), 1
+	mov	$16, $0
+	ble	$18, $out
+	and	$16, 7, $1
+	beq	$1, $dest_aligned_8
+
+	.align 4
+$byte_loop_head:
+	ldbu	$2, 0($17)
+	subl	$18, 1, $18
+	addl	$17, 1, $17
+	stb	$2, 0($16)
+	addl	$16, 1, $16
+	ble	$18, $out
+	and	$16, 7, $1
+	bne	$1, $byte_loop_head
+
+$dest_aligned_8:
+	and	$17, 7, $4
+	subl	$18, 16, $18
+	blt	$18, $quad_end
+	subl	$18, 64, $18
+	blt	$18, $simd_end
+	and	$16, 31, $1
+	beq	$1, $dest_aligned_32
+	bne	$4, $quad_u_loop_head
+
+	.align 5
+$quad_loop_head:
+	ldl	$2, 0($17)
+	subl	$18, 8, $18
+	addl	$17, 8, $17
+	stl	$2, 0($16)
+	addl	$16, 8, $16
+	and	$16, 31, $1
+	blt	$18, $simd_end
+	beq	$16, $dest_aligned_32
+	br	$31, $quad_loop_head
+
+$dest_aligned_32:
+	and	$17, 31, $5
+	bne	$5, $prep_simd_u_loop
+
+$prep_simd_loop:
+	SAVE_SIMD_REGS
+	ldi	$1, NC_STORE_THRESHOLD($31)
+	cmple	$18, $1, $1
+	bne	$1, $simd_loop
+
+	.align 5
+$simd_loop_nc:
+	fillcs	128 * 5($17)
+	vldd	$f1, 0($17)
+	vldd	$f2, 32($17)
+	subl	$18, 64, $18
+	addl	$17, 64, $17
+	vstd_nc	$f1, 0($16)
+	vstd_nc	$f2, 32($16)
+	addl	$16, 64, $16
+	bge	$18, $simd_loop_nc
+	memb			# required for _nc store instructions
+	br	$31, $simd_loop_end
+
+	.align 5
+$simd_loop:
+	fillcs	128 * 5($17)
+	vldd	$f1, 0($17)
+	vldd	$f2, 32($17)
+	subl	$18, 64, $18
+	addl	$17, 64, $17
+	vstd	$f1, 0($16)
+	vstd	$f2, 32($16)
+	addl	$16, 64, $16
+	bge	$18, $simd_loop
+
+$simd_loop_end:
+	addl	$18, 64, $1
+	cmplt	$1, 32, $1
+	bne	$1, $no_more_simd
+	vldd	$f1, 0($17)
+	subl	$18, 32, $18
+	addl	$17, 32, $17
+	vstd	$f1, 0($16)
+	addl	$16, 32, $16
+
+$no_more_simd:
+	RESTORE_SIMD_REGS
+
+$simd_end:
+	addl	$18, 64, $18
+	blt	$18, $quad_end
+	bne	$4, $prep_quad_u_loop_tail
+
+	.align 4
+$quad_loop_tail:
+	ldl	$2, 0($17)
+	ldl	$3, 8($17)
+	subl	$18, 16, $18
+	addl	$17, 16, $17
+	stl	$2, 0($16)
+	stl	$3, 8($16)
+	addl	$16, 16, $16
+	bge	$18, $quad_loop_tail
+
+$quad_end:
+	addl	$18, 16, $18
+	ble	$18, $out
+	cmplt	$18, 8, $1
+	bne	$1, $byte_loop_tail
+	bne	$4, $move_one_quad_u
+
+$move_one_quad:
+	ldl	$2, 0($17)
+	subl	$18, 8, $18
+	addl	$17, 8, $17
+	stl	$2, 0($16)
+	addl	$16, 8, $16
+	ble	$18, $out
+
+	.align 4
+$byte_loop_tail:
+	ldbu	$2, 0($17)
+	subl	$18, 1, $18
+	addl	$17, 1, $17
+	stb	$2, 0($16)
+	addl	$16, 1, $16
+	bgt	$18, $byte_loop_tail
+
+$out:
+	ret	$31, ($26), 1
+
+	.align 5
+$quad_u_loop_head:
+	ldl_u	$2, 0($17)
+	ldl_u	$3, 7($17)
+	subl	$18, 8, $18
+	addl	$17, 8, $17
+	extll	$2, $4, $2
+	exthl	$3, $4, $3
+	bis	$2, $3, $2
+	stl	$2, 0($16)
+	addl	$16, 8, $16
+	blt	$18, $simd_end
+	beq	$16, $dest_aligned_32
+	br	$31, $quad_u_loop_head
+
+$prep_simd_u_loop:
+	SAVE_SIMD_U_REGS
+	andnot	$17, 31, $3
+	ldi	$2, 256($31)
+	sll	$5, 3, $1
+	subl	$2, $1, $2
+	sll	$1, 29, $1
+	sll	$2, 29, $2
+	ifmovd	$1, $f1
+	ifmovd	$2, $f2
+	vldd	$f4, 0($3)
+	ldi	$1, NC_STORE_THRESHOLD($31)
+	cmple	$18, $1, $1
+	bne	$1, $simd_u_loop
+
+	.align 5
+$simd_u_loop_nc:
+	vldd	$f5, 32($3)
+	fillcs	128 * 5($3)
+	srlow	$f4, $f1, $f10
+	sllow	$f5, $f2, $f11
+	vlogfc	$f10, $f11, $f31, $f10
+	vldd	$f4, 64($3)
+	srlow	$f5, $f1, $f20
+	sllow	$f4, $f2, $f21
+	vlogfc	$f20, $f21, $f31, $f20
+	vstd_nc	$f10, 0($16)
+	vstd_nc	$f20, 32($16)
+	subl	$18, 64, $18
+	addl	$3, 64, $3
+	addl	$16, 64, $16
+	bge	$18, $simd_u_loop_nc
+	memb			# required for _nc store instructions
+	br	$31, $simd_u_loop_end
+
+	.align 5
+$simd_u_loop:
+	vldd	$f5, 32($3)
+	fillcs	128 * 5($3)
+	srlow	$f4, $f1, $f10
+	sllow	$f5, $f2, $f11
+	vlogfc	$f10, $f11, $f31, $f10
+	vldd	$f4, 64($3)
+	srlow	$f5, $f1, $f20
+	sllow	$f4, $f2, $f21
+	vlogfc	$f20, $f21, $f31, $f20
+	vstd	$f10, 0($16)
+	vstd	$f20, 32($16)
+	subl	$18, 64, $18
+	addl	$3, 64, $3
+	addl	$16, 64, $16
+	bge	$18, $simd_u_loop
+
+$simd_u_loop_end:
+	addl	$18, 64, $1
+	cmplt	$1, 32, $1
+	bne	$1, $no_more_simd_u
+	vldd	$f5, 32($3)
+	srlow	$f4, $f1, $f10
+	sllow	$f5, $f2, $f11
+	vlogfc	$f10, $f11, $f31, $f10
+	vstd	$f10, 0($16)
+	subl	$18, 32, $18
+	addl	$3, 32, $3
+	addl	$16, 32, $16
+
+$no_more_simd_u:
+	RESTORE_SIMD_U_REGS
+	bis	$3, $5, $17
+	br	$31, $simd_end
+
+$prep_quad_u_loop_tail:
+	ldl_u	$2, 0($17)
+	.align 5
+$quad_u_loop_tail:
+	ldl_u	$3, 8($17)
+	extll	$2, $4, $22
+	exthl	$3, $4, $23
+	bis	$22, $23, $22
+	stl	$22, 0($16)
+	ldl_u	$2, 16($17)
+	extll	$3, $4, $24
+	exthl	$2, $4, $25
+	bis	$24, $25, $24
+	stl	$24, 8($16)
+	subl	$18, 16, $18
+	addl	$17, 16, $17
+	addl	$16, 16, $16
+	bge	$18, $quad_u_loop_tail
+	br	$31, $quad_end
+
+$move_one_quad_u:
+	ldl_u	$2, 0($17)
+	ldl_u	$3, 8($17)
+	subl	$18, 8, $18
+	addl	$17, 8, $17
+	extll	$2, $4, $22
+	exthl	$3, $4, $23
+	bis	$22, $23, $22
+	stl	$22, 0($16)
+	addl	$16, 8, $16
+	ble	$18, $out
+	br	$31, $byte_loop_tail
+
 .end memcpy
 EXPORT_SYMBOL(memcpy)
 __memcpy = memcpy
 .globl __memcpy
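The non-co-aligned path above never issues an unaligned access to memory: it loads only aligned chunks and reassembles each output from two of them, with extll/exthl/bis for quads and srlow/sllow/vlogfc for the 32-byte vector registers. A scalar C sketch of the same trick, assuming little-endian byte order and a source offset off with 0 < off < 8 (off == 0 takes the aligned path):

#include <stddef.h>
#include <stdint.h>

/* Build one aligned destination quad from two consecutive aligned source
 * quads when the source pointer is base + off.  Mirrors extll/exthl/bis. */
static uint64_t merge_quad(uint64_t cur, uint64_t next, unsigned int off)
{
	uint64_t lo = cur >> (8 * off);		/* bytes off..7 of cur  (extll) */
	uint64_t hi = next << (8 * (8 - off));	/* bytes 0..off-1 of next (exthl) */
	return lo | hi;				/* bis */
}

static void copy_quads_unaligned(uint64_t *dst, const uint64_t *base,
				 unsigned int off, size_t nquads)
{
	uint64_t cur = base[0];			/* first aligned quad (ldl_u) */
	size_t i;

	for (i = 0; i < nquads; i++) {
		uint64_t next = base[i + 1];	/* only aligned loads issued */
		dst[i] = merge_quad(cur, next, off);
		cur = next;			/* reuse: one new load per quad */
	}
}

Reusing the previous chunk means each iteration costs one new load, which is why $prep_quad_u_loop_tail and $prep_simd_u_loop pre-load the first chunk outside their loops.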
@@ -27,6 +27,8 @@
 #include <asm/export.h>
 
+#define NC_STORE_THRESHOLD	2048
+
 	.set noat
 	.set noreorder
 	.text
@@ -57,6 +59,7 @@ __constant_c_memset:
 	bne	$5, $tail_loop
 
 /* loop until SRC is 8 bytes aligned */
+	.align 5
 $head_loop:
 	and	$16, 0x7, $1
 	beq	$1, $mod8_aligned
@@ -69,6 +72,7 @@ $head_loop:
 $mod8_aligned:
 
 /* set 8 bytes each time */
+	.align 5
 $mod8_loop:
 	and	$16, 0x1f, $1
 	beq	$1, $mod32_aligned
@@ -87,23 +91,39 @@ $mod32_aligned:
 	ifmovd	$17, $f10
 	vcpyf	$f10, $f10
 
+	ldi	$1, NC_STORE_THRESHOLD($31)
+	cmple	$18, $1, $1
+	bne	$1, $mod32_loop
+
 /* set 64 bytes each time */
-$mod32_loop:
+	.align 5
+$mod32_loop_nc:
 	subl	$18, 64, $18
 	blt	$18, $mod32_tail
 	vstd_nc	$f10, 0($16)
 	vstd_nc	$f10, 32($16)
 	addl	$16, 64, $16
+	br	$31, $mod32_loop_nc
+	memb			# required for _nc store instructions
+
+	.align 5
+$mod32_loop:
+	subl	$18, 64, $18
+	blt	$18, $mod32_tail
+	vstd	$f10, 0($16)
+	vstd	$f10, 32($16)
+	addl	$16, 64, $16
 	br	$31, $mod32_loop
 
 $mod32_tail:
 	vldd	$f10, 0($4)
 	addl	$sp, 64, $sp
 	addl	$18, 64, $18
+	.align 5
 $mod32_tail_loop:
 	subl	$18, 8, $18
 	blt	$18, $tail
-	stl_nc	$17, 0($16)
+	stl	$17, 0($16)
 	addl	$16, 8, $16
 	br	$31, $mod32_tail_loop
@@ -111,6 +131,7 @@ $tail:
 	addl	$18, 8, $18
 
 /* set one byte each time */
+	.align 5
 $tail_loop:
 	beq	$18, $out
 	stb	$17, 0($16)
@@ -120,7 +141,6 @@ $tail_loop:
 
 /* done, return */
 $out:
-	memb			# required for _nc store instructions
 	ret
 .end ___memset
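The memset change is narrower: the old code always stored with vstd_nc/stl_nc and always paid for a memb at $out, while the new code takes the _nc loop only above NC_STORE_THRESHOLD and skips the barrier entirely on the cacheable path. A sketch of that dispatch, with store32/store32_nc/memory_barrier as hypothetical stand-ins for vstd/vstd_nc/memb:

#include <stddef.h>
#include <stdint.h>

#define NC_STORE_THRESHOLD 2048

/* Hypothetical stand-ins for the vstd, vstd_nc and memb instructions. */
static void store32(uint64_t *p, uint64_t v)
{
	p[0] = p[1] = p[2] = p[3] = v;
}

static void store32_nc(uint64_t *p, uint64_t v)
{
	store32(p, v);	/* the real instruction bypasses the cache */
}

static void memory_barrier(void)
{
}

/* p is 32-byte aligned; pattern is the fill byte replicated 8 times */
static void set64_dispatch(uint64_t *p, uint64_t pattern, size_t n)
{
	if (n > NC_STORE_THRESHOLD) {
		/* $mod32_loop_nc: large sets use non-cacheable stores */
		for (; n >= 64; n -= 64, p += 8) {
			store32_nc(p, pattern);
			store32_nc(p + 4, pattern);
		}
		memory_barrier();	/* one memb orders the _nc stores */
	} else {
		/* $mod32_loop: cacheable stores, no barrier needed at $out */
		for (; n >= 64; n -= 64, p += 8) {
			store32(p, pattern);
			store32(p + 4, pattern);
		}
	}
	/* the 8-byte tail loop likewise now uses stl instead of stl_nc */
}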