Commit e6896efc authored by Mao Minkai, committed by guzitao

sw64: adjust instruction order in deep-copy_template.S

Sunway inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5PN9S

--------------------------------

Adjust the order of instructions in deep-copy_template.S so that $18
always holds the number of bytes still left to copy. This ensures the
return value of copy_{to,from}_user() is correct when a fault interrupts
the copy partway through.
Signed-off-by: Mao Minkai <maominkai@wxiat.com>
Signed-off-by: Gu Zitao <guzitao@wxiat.com>
Parent 3fce9bf6
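The reordering matters because copy_{to,from}_user() report failure through their return value: the number of bytes that could not be copied, which the sw64 fixup path takes from $18 at the moment a load or store faults. Below is a minimal userspace C sketch of that contract; the setjmp/longjmp fault model and every name in it are illustrative stand-ins, not the kernel's actual FIXUP_LDST/exception-table mechanism.

#include <setjmp.h>
#include <stddef.h>
#include <stdio.h>

/*
 * Userspace model of the invariant this patch enforces: the "bytes left"
 * counter (register $18 in the real template) must be accurate at every
 * instruction boundary, because a faulting access jumps straight to the
 * fixup path, which simply reports that counter.
 */
static jmp_buf fixup;

static void store_byte(char *dst, char c, size_t copied, size_t fault_at)
{
	if (copied == fault_at)
		longjmp(fixup, 1);	/* pretend this store faults */
	*dst = c;
}

static size_t model_copy(char *dst, const char *src, size_t len,
			 size_t fault_at)
{
	volatile size_t left = len;	/* plays the role of $18 */

	if (setjmp(fixup))
		return left;		/* fixup path: report bytes left */

	while (left) {
		char c = src[len - left];
		store_byte(&dst[len - left], c, len - left, fault_at);
		/* Decrement only after the store has succeeded, mirroring
		 * the "store first, then subl $18" order in this patch. */
		left--;
	}
	return 0;			/* everything copied */
}

int main(void)
{
	char src[8] = "abcdefg", dst[8] = { 0 };

	/* Fault on the 4th byte: 3 bytes copied, so 5 must be reported left. */
	printf("bytes not copied: %zu\n", model_copy(dst, src, 8, 3));
	return 0;
}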
@@ -3,9 +3,11 @@
 /*
  * template for memcpy and copy_user with SIMD
  *
- * $16: current store address
- * $17: current load address
- * $18: current bytes left to copy
+ * $4: 8-byte misalignment of src when dest is 8-byte aligned
+ * $5: 32-byte misalignment of src when dest is 32-byte aligned
+ * $16: latest dest, clobbered
+ * $17: latest src, clobbered
+ * $18: bytes left to copy
  *
  */
 
@@ -26,17 +28,14 @@
 	ldi $sp, 0x60($sp)
 
 #define SAVE_SIMD_U_REGS \
-	ldi $sp, -0x120($sp); \
+	ldi $sp, -0xc0($sp); \
 	addl $sp, 0x1f, $23; \
 	bic $23, 0x1f, $23; \
 	vstd $f1, 0($23); \
 	vstd $f2, 0x20($23); \
 	vstd $f4, 0x40($23); \
 	vstd $f5, 0x60($23); \
-	vstd $f10, 0x80($23); \
-	vstd $f11, 0xa0($23); \
-	vstd $f20, 0xc0($23); \
-	vstd $f21, 0xe0($23)
+	vstd $f3, 0x80($23)
 
 #define RESTORE_SIMD_U_REGS \
 	addl $sp, 0x1f, $23; \
@@ -45,22 +44,19 @@
 	vldd $f2, 0x20($23); \
 	vldd $f4, 0x40($23); \
 	vldd $f5, 0x60($23); \
-	vldd $f10, 0x80($23); \
-	vldd $f11, 0xa0($23); \
-	vldd $f20, 0xc0($23); \
-	vldd $f21, 0xe0($23); \
-	ldi $sp, 0x120($sp)
+	vldd $f3, 0x80($23); \
+	ldi $sp, 0xc0($sp)
 
 	ble $18, $out
 	and $16, 7, $1
 	beq $1, $dest_aligned_8
 
-	.align 4
+	.align 3
 $byte_loop_head:
 	FIXUP_LDST( ldbu $2, 0($17) )
+	FIXUP_LDST( stb $2, 0($16) )
 	subl $18, 1, $18
 	addl $17, 1, $17
-	FIXUP_LDST( stb $2, 0($16) )
 	addl $16, 1, $16
 	ble $18, $out
 	and $16, 7, $1
@@ -68,27 +64,28 @@ $byte_loop_head:
 
 $dest_aligned_8:
 	and $17, 7, $4
-	subl $18, 16, $18
-	blt $18, $quad_end
-	subl $18, 64, $18
-	blt $18, $simd_end
+	cmplt $18, 16, $1
+	bne $1, $quad_loop_end
 	and $16, 31, $1
 	beq $1, $dest_aligned_32
+	cmplt $18, 64, $1
+	bne $1, $simd_end
 	bne $4, $quad_u_loop_head
 
-	.align 5
+	.align 3
 $quad_loop_head:
 	FIXUP_LDST( ldl $2, 0($17) )
-	subl $18, 8, $18
-	addl $17, 8, $17
 	FIXUP_LDST( stl $2, 0($16) )
 	addl $16, 8, $16
-	blt $18, $simd_end
+	addl $17, 8, $17
+	subl $18, 8, $18
 	and $16, 31, $1
 	beq $1, $dest_aligned_32
 	br $31, $quad_loop_head
 
 $dest_aligned_32:
+	cmplt $18, 64, $1
+	bne $1, $simd_end
 	and $17, 31, $5
 	bne $5, $prep_simd_u_loop
 
@@ -98,63 +95,65 @@ $prep_simd_loop:
 	cmple $18, $1, $1
 	bne $1, $simd_loop
 
-	.align 5
+	.align 4
 $simd_loop_nc:
 	fillcs 128 * 5($17)
 	FIXUP_LDST( vldd $f1, 0($17) )
 	FIXUP_LDST( vldd $f2, 32($17) )
-	subl $18, 64, $18
-	addl $17, 64, $17
 	FIXUP_LDST( vstd_nc $f1, 0($16) )
 	FIXUP_LDST( vstd_nc $f2, 32($16) )
+	subl $18, 64, $18
+	addl $17, 64, $17
 	addl $16, 64, $16
-	bge $18, $simd_loop_nc
+	cmplt $18, 64, $1
+	beq $1, $simd_loop_nc
 	memb # required for _nc store instructions
 	br $31, $simd_loop_end
 
-	.align 5
+	.align 4
 $simd_loop:
 	fillcs 128 * 5($17)
 	FIXUP_LDST( vldd $f1, 0($17) )
 	FIXUP_LDST( vldd $f2, 32($17) )
-	subl $18, 64, $18
-	addl $17, 64, $17
 	FIXUP_LDST( vstd $f1, 0($16) )
 	FIXUP_LDST( vstd $f2, 32($16) )
+	subl $18, 64, $18
+	addl $17, 64, $17
 	addl $16, 64, $16
-	bge $18, $simd_loop
+	cmplt $18, 64, $1
+	beq $1, $simd_loop
 
 $simd_loop_end:
-	addl $18, 64, $1
-	cmplt $1, 32, $1
+	cmplt $18, 32, $1
 	bne $1, $no_more_simd
 	FIXUP_LDST( vldd $f1, 0($17) )
+	FIXUP_LDST( vstd $f1, 0($16) )
 	subl $18, 32, $18
 	addl $17, 32, $17
-	FIXUP_LDST( vstd $f1, 0($16) )
 	addl $16, 32, $16
 
 $no_more_simd:
 	RESTORE_SIMD_REGS
 
 $simd_end:
-	addl $18, 64, $18
-	blt $18, $quad_end
+	ble $18, $out
+	cmplt $18, 16, $1
+	bne $1, $quad_loop_end
 	bne $4, $prep_quad_u_loop_tail
 
 	.align 4
 $quad_loop_tail:
 	FIXUP_LDST( ldl $2, 0($17) )
 	FIXUP_LDST( ldl $3, 8($17) )
-	subl $18, 16, $18
-	addl $17, 16, $17
 	FIXUP_LDST( stl $2, 0($16) )
 	FIXUP_LDST( stl $3, 8($16) )
+	subl $18, 16, $18
+	addl $17, 16, $17
 	addl $16, 16, $16
-	bge $18, $quad_loop_tail
+	cmplt $18, 16, $1
+	beq $1, $quad_loop_tail
 
-$quad_end:
-	addl $18, 16, $18
+$quad_loop_end:
 	ble $18, $out
 	cmplt $18, 8, $1
 	bne $1, $byte_loop_tail
@@ -162,35 +161,34 @@ $quad_end:
 
 $move_one_quad:
 	FIXUP_LDST( ldl $2, 0($17) )
+	FIXUP_LDST( stl $2, 0($16) )
 	subl $18, 8, $18
 	addl $17, 8, $17
-	FIXUP_LDST( stl $2, 0($16) )
 	addl $16, 8, $16
 	ble $18, $out
 
-	.align 4
+	.align 3
 $byte_loop_tail:
 	FIXUP_LDST( ldbu $2, 0($17) )
+	FIXUP_LDST( stb $2, 0($16) )
 	subl $18, 1, $18
 	addl $17, 1, $17
-	FIXUP_LDST( stb $2, 0($16) )
 	addl $16, 1, $16
 	bgt $18, $byte_loop_tail
 	br $31, $out
 
 /* misaligned src and dst */
-	.align 5
+	.align 4
 $quad_u_loop_head:
 	FIXUP_LDST( ldl_u $2, 0($17) )
 	FIXUP_LDST( ldl_u $3, 7($17) )
-	subl $18, 8, $18
-	addl $17, 8, $17
 	extll $2, $4, $2
 	exthl $3, $4, $3
 	bis $2, $3, $2
 	FIXUP_LDST( stl $2, 0($16) )
 	addl $16, 8, $16
-	blt $18, $simd_end
+	addl $17, 8, $17
+	subl $18, 8, $18
 	and $16, 31, $1
 	beq $1, $dest_aligned_32
 	br $31, $quad_u_loop_head
@@ -210,53 +208,54 @@ $prep_simd_u_loop:
 	cmple $18, $1, $1
 	bne $1, $simd_u_loop
 
-	.align 5
+	.align 4
 $simd_u_loop_nc:
 	FIXUP_LDST( vldd $f5, 32($3) )
 	fillcs 128 * 5($3)
-	srlow $f4, $f1, $f10
-	sllow $f5, $f2, $f11
-	vlogfc $f10, $f11, $f31, $f10
+	srlow $f4, $f1, $f4
+	sllow $f5, $f2, $f3
+	vlogfc $f3, $f4, $f31, $f3
+	FIXUP_LDST( vstd_nc $f3, 0($16) )
 	FIXUP_LDST( vldd $f4, 64($3) )
-	srlow $f5, $f1, $f20
-	sllow $f4, $f2, $f21
-	vlogfc $f20, $f21, $f31, $f20
-	FIXUP_LDST( vstd_nc $f10, 0($16) )
-	FIXUP_LDST( vstd_nc $f20, 32($16) )
+	srlow $f5, $f1, $f5
+	sllow $f4, $f2, $f3
+	vlogfc $f5, $f3, $f31, $f5
+	FIXUP_LDST( vstd_nc $f5, 32($16) )
 	subl $18, 64, $18
 	addl $3, 64, $3
 	addl $16, 64, $16
-	bge $18, $simd_u_loop_nc
+	cmplt $18, 64, $1
+	beq $1, $simd_u_loop_nc
 	memb # required for _nc store instructions
 	br $31, $simd_u_loop_end
 
-	.align 5
+	.align 4
 $simd_u_loop:
 	FIXUP_LDST( vldd $f5, 32($3) )
 	fillcs 128 * 5($3)
-	srlow $f4, $f1, $f10
-	sllow $f5, $f2, $f11
-	vlogfc $f10, $f11, $f31, $f10
+	srlow $f4, $f1, $f4
+	sllow $f5, $f2, $f3
+	vlogfc $f4, $f3, $f31, $f3
+	FIXUP_LDST( vstd $f3, 0($16) )
 	FIXUP_LDST( vldd $f4, 64($3) )
-	srlow $f5, $f1, $f20
-	sllow $f4, $f2, $f21
-	vlogfc $f20, $f21, $f31, $f20
-	FIXUP_LDST( vstd $f10, 0($16) )
-	FIXUP_LDST( vstd $f20, 32($16) )
+	srlow $f5, $f1, $f5
+	sllow $f4, $f2, $f3
+	vlogfc $f5, $f3, $f31, $f3
+	FIXUP_LDST( vstd $f3, 32($16) )
 	subl $18, 64, $18
 	addl $3, 64, $3
 	addl $16, 64, $16
-	bge $18, $simd_u_loop
+	cmplt $18, 64, $1
+	beq $1, $simd_u_loop
 
 $simd_u_loop_end:
-	addl $18, 64, $1
-	cmplt $1, 32, $1
+	cmplt $18, 32, $1
 	bne $1, $no_more_simd_u
 	FIXUP_LDST( vldd $f5, 32($3) )
-	srlow $f4, $f1, $f10
-	sllow $f5, $f2, $f11
-	vlogfc $f10, $f11, $f31, $f10
-	FIXUP_LDST( vstd $f10, 0($16) )
+	srlow $f4, $f1, $f4
+	sllow $f5, $f2, $f3
+	vlogfc $f4, $f3, $f31, $f3
+	FIXUP_LDST( vstd $f3, 0($16) )
 	subl $18, 32, $18
 	addl $3, 32, $3
 	addl $16, 32, $16
@@ -268,7 +267,7 @@ $no_more_simd_u:
 $prep_quad_u_loop_tail:
 	FIXUP_LDST( ldl_u $2, 0($17) )
 
-	.align 5
+	.align 4
 $quad_u_loop_tail:
 	FIXUP_LDST( ldl_u $3, 8($17) )
 	extll $2, $4, $22
@@ -283,18 +282,19 @@ $quad_u_loop_tail:
 	subl $18, 16, $18
 	addl $17, 16, $17
 	addl $16, 16, $16
-	bge $18, $quad_u_loop_tail
-	br $31, $quad_end
+	cmplt $18, 16, $1
+	beq $1, $quad_u_loop_tail
+	br $31, $quad_loop_end
 
 $move_one_quad_u:
 	FIXUP_LDST( ldl_u $2, 0($17) )
 	FIXUP_LDST( ldl_u $3, 8($17) )
-	subl $18, 8, $18
-	addl $17, 8, $17
 	extll $2, $4, $22
 	exthl $3, $4, $23
 	bis $22, $23, $22
 	FIXUP_LDST( stl $22, 0($16) )
+	subl $18, 8, $18
+	addl $17, 8, $17
 	addl $16, 8, $16
 	ble $18, $out
 	br $31, $byte_loop_tail
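For context on why an overstated count is user-visible: callers of copy_{to,from}_user() routinely turn the "bytes not copied" return value into -EFAULT or a partial-length result. A hypothetical kernel-side caller of this kind (illustrative only; not part of this patch or the sw64 tree) might look like:

#include <linux/uaccess.h>
#include <linux/errno.h>

/*
 * Hypothetical helper: returns the number of bytes actually copied, or
 * -EFAULT if nothing could be copied.  If the assembly template left an
 * inflated "bytes left" value in $18 after a fault, the subtraction
 * below would under-report how much data really reached kbuf.
 */
static long example_read_from_user(void *kbuf, const void __user *ubuf,
				   unsigned long len)
{
	unsigned long not_copied = copy_from_user(kbuf, ubuf, len);

	if (not_copied == len)
		return -EFAULT;

	return len - not_copied;
}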