提交 89aefedb 编写于 作者: M Mao Minkai 提交者: guzitao

sw64: add deep-set-template.S

Sunway inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5XTLH

--------------------------------

Add deep-set-template.S to rewrite memset() and optimize
__clear_user().
Signed-off-by: Mao Minkai <maominkai@wxiat.com>
Reviewed-by: He Sheng <hesheng@wxiat.com>
Signed-off-by: Gu Zitao <guzitao@wxiat.com>
上级 d1878f16
...@@ -8,6 +8,13 @@ config DEEP_CLEAR_PAGE ...@@ -8,6 +8,13 @@ config DEEP_CLEAR_PAGE
This option enables the use of SIMD version of clear page routine. This option enables the use of SIMD version of clear page routine.
Say N if you want to use the generic version. Say N if you want to use the generic version.
# Build-time switch for the clear-user routine: when set, the library
# Makefile links deep-clear_user.o in place of clear_user.o.
config DEEP_CLEAR_USER
bool "Clear User with SIMD optimization"
default y
help
This option enables the use of SIMD version of clear user routine.
Say N if you want to use the generic version.
config DEEP_COPY_PAGE config DEEP_COPY_PAGE
bool "Copy Page with SIMD optimization" bool "Copy Page with SIMD optimization"
default y default y
......
...@@ -11,7 +11,6 @@ lib-y = __divlu.o __remlu.o __divwu.o __remwu.o \ ...@@ -11,7 +11,6 @@ lib-y = __divlu.o __remlu.o __divwu.o __remwu.o \
memmove.o \ memmove.o \
checksum.o \ checksum.o \
csum_partial_copy.o \ csum_partial_copy.o \
clear_user.o \
fpreg.o \ fpreg.o \
strcpy.o \ strcpy.o \
strncpy.o \ strncpy.o \
...@@ -21,6 +20,9 @@ lib-y = __divlu.o __remlu.o __divwu.o __remwu.o \ ...@@ -21,6 +20,9 @@ lib-y = __divlu.o __remlu.o __divwu.o __remwu.o \
lib-clear_page-y := clear_page.o lib-clear_page-y := clear_page.o
lib-clear_page-$(CONFIG_DEEP_CLEAR_PAGE) := deep-clear_page.o lib-clear_page-$(CONFIG_DEEP_CLEAR_PAGE) := deep-clear_page.o
# Pick the clear_user implementation: the generic object is the default and
# the second assignment overwrites lib-clear_user-y only when
# CONFIG_DEEP_CLEAR_USER expands to y.
lib-clear_user-y := clear_user.o
lib-clear_user-$(CONFIG_DEEP_CLEAR_USER) := deep-clear_user.o
lib-copy_page-y := copy_page.o lib-copy_page-y := copy_page.o
lib-copy_page-$(CONFIG_DEEP_COPY_PAGE) := deep-copy_page.o lib-copy_page-$(CONFIG_DEEP_COPY_PAGE) := deep-copy_page.o
...@@ -33,7 +35,7 @@ lib-memcpy-$(CONFIG_DEEP_MEMCPY) := deep-memcpy.o ...@@ -33,7 +35,7 @@ lib-memcpy-$(CONFIG_DEEP_MEMCPY) := deep-memcpy.o
lib-memset-y := memset.o lib-memset-y := memset.o
lib-memset-$(CONFIG_DEEP_MEMSET) := deep-memset.o lib-memset-$(CONFIG_DEEP_MEMSET) := deep-memset.o
lib-y += $(lib-clear_page-y) $(lib-copy_page-y) $(lib-copy_user-y) $(lib-memcpy-y) $(lib-memset-y) lib-y += $(lib-clear_page-y) $(lib-clear_user-y) $(lib-copy_page-y) $(lib-copy_user-y) $(lib-memcpy-y) $(lib-memset-y)
obj-y = iomap.o obj-y = iomap.o
obj-y += iomap_copy.o obj-y += iomap_copy.o
......
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Contributed by Mao Minkai <maominkai@wxiat.com>
*
* Zero user space, handling exceptions as we go.
*
* We have to make sure that $18 always contains the right "bytes left
* to zero" value (and that it is updated only _after_ a successful
* store); it is copied to $0 when we leave through $out. There is also
* some rather minor exception setup stuff.
*
*/
#include <asm/export.h>
/*
* Allow an exception for an insn; exit if we get one.
*
* FIXUP_LDST(insn operands...) emits the instruction at local label 99 and
* adds one record to the __ex_table section for it:
*   - a 32-bit offset from the record to the instruction (99b - .)
*   - an "ldi" whose displacement is the distance from the instruction to
*     $out, so the enclosing routine must define a $out label.
* NOTE(review): how the trap handler decodes the ldi (registers and
* displacement) is not visible in this file -- confirm against the sw64
* exception fixup code.
*/
#define FIXUP_LDST(x,y...) \
99: x,##y; \
.section __ex_table,"a"; \
.long 99b - .; \
ldi $31, $out-99b($31); \
.previous
/*
* __clear_user: zero a buffer through the shared deep-set_template.S code,
* with every store covered by a FIXUP_LDST exception table entry.
*
* Entry (as consumed below; presumably destination and byte count per the
* C prototype -- TODO confirm against the uaccess header):
* $16: destination address
* $17: byte count
* Exit:
* $0: value of $18 when $out is reached, i.e. bytes left to zero
*
* $7: SIMD status
* 0: not in simd loop
* 1: in simd loop
* 2: in simd_u loop
* $18: bytes left to zero
*
*/
.globl __clear_user
.ent __clear_user
__clear_user:
.prologue 0
/* $7 = 0: not in the SIMD loop yet. */
bis $31, $31, $7
/* The template wants the count in $18 and the fill pattern (zero) in $17. */
mov $17, $18
bis $31, $31, $17
#include "deep-set_template.S"
/*
* The template branches here when it is done; the FIXUP_LDST records also
* name $out as the target for a faulting store.
*/
$out:
bis $31, $18, $0
/* $f1 only has to be restored if we got here while $7 was still set. */
beq $7, $return
$restore_simd:
RESTORE_SIMD_REGS
$return:
ret
.end __clear_user
EXPORT_SYMBOL(__clear_user)
...@@ -27,7 +27,8 @@ ...@@ -27,7 +27,8 @@
#include <asm/export.h> #include <asm/export.h>
#define NC_STORE_THRESHOLD 2048 #define FIXUP_LDST(x, y) \
x, y
.set noat .set noat
.set noreorder .set noreorder
...@@ -53,94 +54,9 @@ ___memset: ...@@ -53,94 +54,9 @@ ___memset:
bis $17, $4, $17 bis $17, $4, $17
__constant_c_memset: __constant_c_memset:
bis $31, $16, $0 # set return value bis $31, $31, $7
beq $18, $out # return if size is 0 bis $31, $16, $0
cmplt $18, 8, $5 # size less than 8, do 1-byte loop #include "deep-set_template.S"
bne $5, $tail_loop
/* loop until SRC is 8 bytes aligned */
.align 5
$head_loop:
and $16, 0x7, $1
beq $1, $mod8_aligned
stb $17, 0($16)
subl $18, 1, $18
beq $18, $out
addl $16, 1, $16
br $31, $head_loop
$mod8_aligned:
/* set 8 bytes each time */
.align 5
$mod8_loop:
and $16, 0x1f, $1
beq $1, $mod32_aligned
subl $18, 8, $18
blt $18, $tail
stl $17, 0($16)
addl $16, 8, $16
br $31, $mod8_loop
/* expand data to 32 bytes */
$mod32_aligned:
subl $sp, 64, $sp
addl $sp, 31, $4
bic $4, 0x1f, $4
vstd $f10, 0($4)
ifmovd $17, $f10
vcpyf $f10, $f10
ldi $1, NC_STORE_THRESHOLD($31)
cmple $18, $1, $1
bne $1, $mod32_loop
/* set 64 bytes each time */
.align 5
$mod32_loop_nc:
subl $18, 64, $18
blt $18, $mod32_tail_memb
vstd_nc $f10, 0($16)
vstd_nc $f10, 32($16)
addl $16, 64, $16
br $31, $mod32_loop_nc
.align 5
$mod32_loop:
subl $18, 64, $18
blt $18, $mod32_tail
vstd $f10, 0($16)
vstd $f10, 32($16)
addl $16, 64, $16
br $31, $mod32_loop
$mod32_tail_memb:
memb # required for _nc store instructions
$mod32_tail:
vldd $f10, 0($4)
addl $sp, 64, $sp
addl $18, 64, $18
.align 5
$mod32_tail_loop:
subl $18, 8, $18
blt $18, $tail
stl $17, 0($16)
addl $16, 8, $16
br $31, $mod32_tail_loop
$tail:
addl $18, 8, $18
/* set one byte each time */
.align 5
$tail_loop:
beq $18, $out
stb $17, 0($16)
subl $18, 1, $18
addl $16, 1, $16
br $31, $tail_loop
/* done, return */
$out: $out:
ret ret
......
/* SPDX-License-Identifier: GPL-2.0 */
/*
* template for memset and clear_user with SIMD
*
* This file is #included in the middle of a routine. The including file
* must provide a FIXUP_LDST() macro, which wraps every store here, and a
* $out label, which this code branches to when it is finished.
*
* $1: scratch, clobbered
* $7: SIMD status
* 0: not in simd loop
* 1: in simd loop ($f1 is saved on the stack and must be restored)
* 2: in simd_u loop (never set by this template)
* $16: latest dest, clobbered
* $17: 8-byte data to set
* $18: bytes left to set
* $23: 32-byte aligned address of the $f1 save slot while $7 is 1
*
*/
/* With more than this many bytes left, the SIMD loop uses vstd_nc. */
#define NC_STORE_THRESHOLD 2048
/*
* Reserve 0x40 bytes of stack, round the new $sp up to a 32-byte boundary
* in $23, save $f1 there and set $7 to 1.
*/
#define SAVE_SIMD_REGS \
ldi $sp, -0x40($sp); \
addl $sp, 0x1f, $23; \
bic $23, 0x1f, $23; \
vstd $f1, 0($23); \
ldi $7, 1
/* Undo SAVE_SIMD_REGS: reload $f1 from 0($23), release the stack, clear $7. */
#define RESTORE_SIMD_REGS \
vldd $f1, 0($23); \
ldi $sp, 0x40($sp); \
bis $31, $31, $7
/* Nothing to do when the count in $18 is zero or negative. */
ble $18, $out
/* Phase 1: store single bytes until the destination is 8-byte aligned. */
and $16, 7, $1
beq $1, $dest_aligned_8
.align 3
$byte_loop_head:
FIXUP_LDST( stb $17, 0($16) )
/* Count and pointer are only advanced after the store has gone through. */
subl $18, 1, $18
addl $16, 1, $16
ble $18, $out
and $16, 7, $1
bne $1, $byte_loop_head
$dest_aligned_8:
/* Fewer than 16 bytes left: go straight to the 8-byte / 1-byte tail. */
cmplt $18, 16, $1
bne $1, $quad_loop_end
and $16, 31, $1
beq $1, $dest_aligned_32
/* Fewer than 64 bytes left: skip SIMD and finish with 16-byte steps. */
cmplt $18, 64, $1
bne $1, $simd_end
.align 3
/* Phase 2: store 8 bytes at a time until the destination is 32-byte aligned. */
$quad_loop_head:
FIXUP_LDST( stl $17, 0($16) )
addl $16, 8, $16
subl $18, 8, $18
and $16, 31, $1
beq $1, $dest_aligned_32
br $31, $quad_loop_head
$dest_aligned_32:
/* Re-check the count; SIMD is only worthwhile for at least one 64-byte chunk. */
cmplt $18, 64, $1
bne $1, $simd_end
$prep_simd_loop:
SAVE_SIMD_REGS
/* Move the 8-byte pattern into $f1 and expand it to 32 bytes. */
ifmovd $17, $f1
vcpyf $f1, $f1
/* At most NC_STORE_THRESHOLD bytes left: use vstd; otherwise use vstd_nc. */
ldi $1, NC_STORE_THRESHOLD($31)
cmple $18, $1, $1
bne $1, $simd_loop
.align 3
/*
* Phase 3a: 64 bytes per iteration with vstd_nc.
* NOTE(review): $18 is reduced only after both stores, so a fault on the
* second store leaves the first 32 bytes counted as not set.
*/
$simd_loop_nc:
FIXUP_LDST( vstd_nc $f1, 0($16) )
FIXUP_LDST( vstd_nc $f1, 32($16) )
subl $18, 64, $18
addl $16, 64, $16
cmplt $18, 64, $1
beq $1, $simd_loop_nc
memb # required for _nc store instructions
br $31, $simd_loop_end
.align 3
/* Phase 3b: 64 bytes per iteration with vstd; same accounting as above. */
$simd_loop:
FIXUP_LDST( vstd $f1, 0($16) )
FIXUP_LDST( vstd $f1, 32($16) )
subl $18, 64, $18
addl $16, 64, $16
cmplt $18, 64, $1
beq $1, $simd_loop
$simd_loop_end:
/* Fewer than 64 bytes remain; emit one more 32-byte store if it fits. */
cmplt $18, 32, $1
bne $1, $no_more_simd
FIXUP_LDST( vstd $f1, 0($16) )
subl $18, 32, $18
addl $16, 32, $16
$no_more_simd:
/* Leave SIMD state: $f1 is reloaded and $7 goes back to 0. */
RESTORE_SIMD_REGS
$simd_end:
ble $18, $out
cmplt $18, 16, $1
bne $1, $quad_loop_end
.align 3
/*
* Phase 4: 16 bytes per iteration as two 8-byte stores.
* NOTE(review): as in the SIMD loops, $18 is reduced only after both stores.
*/
$quad_loop_tail:
FIXUP_LDST( stl $17, 0($16) )
FIXUP_LDST( stl $17, 8($16) )
subl $18, 16, $18
addl $16, 16, $16
cmplt $18, 16, $1
beq $1, $quad_loop_tail
$quad_loop_end:
/* Fewer than 16 bytes remain: at most one 8-byte store, then single bytes. */
ble $18, $out
cmplt $18, 8, $1
bne $1, $byte_loop_tail
$move_one_quad:
FIXUP_LDST( stl $17, 0($16) )
subl $18, 8, $18
addl $16, 8, $16
ble $18, $out
.align 3
/* Phase 5: store the remaining bytes one at a time. */
$byte_loop_tail:
FIXUP_LDST( stb $17, 0($16) )
subl $18, 1, $18
addl $16, 1, $16
bgt $18, $byte_loop_tail
/* Done: hand control back to the $out label of the including routine. */
br $31, $out
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册