diff --git a/arch/sw_64/lib/deep-copy_template.S b/arch/sw_64/lib/deep-copy_template.S
index f80be2bef44c4ca1395ed7e1c0602448728b0abc..aa6ab8e29f057576ddcc922d31847314933f66c2 100644
--- a/arch/sw_64/lib/deep-copy_template.S
+++ b/arch/sw_64/lib/deep-copy_template.S
@@ -3,9 +3,11 @@
 /*
  * template for memcpy and copy_user with SIMD
  *
- * $16: current store address
- * $17: current load address
- * $18: current bytes left to copy
+ * $4: 8-byte misalignment of src when dest is 8-byte aligned
+ * $5: 32-byte misalignment of src when dest is 32-byte aligned
+ * $16: latest dest, clobbered
+ * $17: latest src, clobbered
+ * $18: bytes left to copy
  *
  */
 
@@ -26,17 +28,14 @@
 	ldi $sp, 0x60($sp)
 
 #define SAVE_SIMD_U_REGS \
-	ldi $sp, -0x120($sp); \
+	ldi $sp, -0xc0($sp); \
 	addl $sp, 0x1f, $23; \
 	bic $23, 0x1f, $23; \
 	vstd $f1, 0($23); \
 	vstd $f2, 0x20($23); \
 	vstd $f4, 0x40($23); \
 	vstd $f5, 0x60($23); \
-	vstd $f10, 0x80($23); \
-	vstd $f11, 0xa0($23); \
-	vstd $f20, 0xc0($23); \
-	vstd $f21, 0xe0($23)
+	vstd $f3, 0x80($23)
 
 #define RESTORE_SIMD_U_REGS \
 	addl $sp, 0x1f, $23; \
@@ -45,22 +44,19 @@
 	vldd $f2, 0x20($23); \
 	vldd $f4, 0x40($23); \
 	vldd $f5, 0x60($23); \
-	vldd $f10, 0x80($23); \
-	vldd $f11, 0xa0($23); \
-	vldd $f20, 0xc0($23); \
-	vldd $f21, 0xe0($23); \
-	ldi $sp, 0x120($sp)
+	vldd $f3, 0x80($23); \
+	ldi $sp, 0xc0($sp)
 
 	ble $18, $out
 	and $16, 7, $1
 	beq $1, $dest_aligned_8
 
-	.align 4
+	.align 3
 $byte_loop_head:
 	FIXUP_LDST( ldbu $2, 0($17) )
+	FIXUP_LDST( stb $2, 0($16) )
 	subl $18, 1, $18
 	addl $17, 1, $17
-	FIXUP_LDST( stb $2, 0($16) )
 	addl $16, 1, $16
 	ble $18, $out
 	and $16, 7, $1
@@ -68,27 +64,28 @@ $byte_loop_head:
 
 $dest_aligned_8:
 	and $17, 7, $4
-	subl $18, 16, $18
-	blt $18, $quad_end
-	subl $18, 64, $18
-	blt $18, $simd_end
+	cmplt $18, 16, $1
+	bne $1, $quad_loop_end
 	and $16, 31, $1
 	beq $1, $dest_aligned_32
+	cmplt $18, 64, $1
+	bne $1, $simd_end
 	bne $4, $quad_u_loop_head
 
-	.align 5
+	.align 3
 $quad_loop_head:
 	FIXUP_LDST( ldl $2, 0($17) )
-	subl $18, 8, $18
-	addl $17, 8, $17
 	FIXUP_LDST( stl $2, 0($16) )
 	addl $16, 8, $16
-	blt $18, $simd_end
+	addl $17, 8, $17
+	subl $18, 8, $18
 	and $16, 31, $1
 	beq $1, $dest_aligned_32
 	br $31, $quad_loop_head
 
 $dest_aligned_32:
+	cmplt $18, 64, $1
+	bne $1, $simd_end
 	and $17, 31, $5
 	bne $5, $prep_simd_u_loop
 
@@ -98,63 +95,65 @@ $prep_simd_loop:
 	cmple $18, $1, $1
 	bne $1, $simd_loop
 
-	.align 5
+	.align 4
 $simd_loop_nc:
 	fillcs 128 * 5($17)
 	FIXUP_LDST( vldd $f1, 0($17) )
 	FIXUP_LDST( vldd $f2, 32($17) )
-	subl $18, 64, $18
-	addl $17, 64, $17
 	FIXUP_LDST( vstd_nc $f1, 0($16) )
 	FIXUP_LDST( vstd_nc $f2, 32($16) )
+	subl $18, 64, $18
+	addl $17, 64, $17
 	addl $16, 64, $16
-	bge $18, $simd_loop_nc
+	cmplt $18, 64, $1
+	beq $1, $simd_loop_nc
 	memb			# required for _nc store instructions
 	br $31, $simd_loop_end
 
-	.align 5
+	.align 4
 $simd_loop:
 	fillcs 128 * 5($17)
 	FIXUP_LDST( vldd $f1, 0($17) )
 	FIXUP_LDST( vldd $f2, 32($17) )
-	subl $18, 64, $18
-	addl $17, 64, $17
 	FIXUP_LDST( vstd $f1, 0($16) )
 	FIXUP_LDST( vstd $f2, 32($16) )
+	subl $18, 64, $18
+	addl $17, 64, $17
 	addl $16, 64, $16
-	bge $18, $simd_loop
+	cmplt $18, 64, $1
+	beq $1, $simd_loop
 
 $simd_loop_end:
-	addl $18, 64, $1
-	cmplt $1, 32, $1
+	cmplt $18, 32, $1
 	bne $1, $no_more_simd
 	FIXUP_LDST( vldd $f1, 0($17) )
+	FIXUP_LDST( vstd $f1, 0($16) )
 	subl $18, 32, $18
 	addl $17, 32, $17
-	FIXUP_LDST( vstd $f1, 0($16) )
 	addl $16, 32, $16
 
 $no_more_simd:
 	RESTORE_SIMD_REGS
 
 $simd_end:
-	addl $18, 64, $18
-	blt $18, $quad_end
+	ble $18, $out
+	cmplt $18, 16, $1
+	bne $1, $quad_loop_end
 	bne $4, $prep_quad_u_loop_tail
 
 	.align 4
 $quad_loop_tail:
 	FIXUP_LDST( ldl $2, 0($17) )
 	FIXUP_LDST( ldl $3, 8($17) )
-	subl $18, 16, $18
-	addl $17, 16, $17
 	FIXUP_LDST( stl $2, 0($16) )
 	FIXUP_LDST( stl $3, 8($16) )
+	subl $18, 16, $18
+	addl $17, 16, $17
 	addl $16, 16, $16
-	bge $18, $quad_loop_tail
+	cmplt $18, 16, $1
+	beq $1, $quad_loop_tail
 
-$quad_end:
-	addl $18, 16, $18
+$quad_loop_end:
 	ble $18, $out
 	cmplt $18, 8, $1
 	bne $1, $byte_loop_tail
@@ -162,35 +161,34 @@ $quad_end:
 
 $move_one_quad:
 	FIXUP_LDST( ldl $2, 0($17) )
+	FIXUP_LDST( stl $2, 0($16) )
 	subl $18, 8, $18
 	addl $17, 8, $17
-	FIXUP_LDST( stl $2, 0($16) )
 	addl $16, 8, $16
 	ble $18, $out
 
-	.align 4
+	.align 3
 $byte_loop_tail:
 	FIXUP_LDST( ldbu $2, 0($17) )
+	FIXUP_LDST( stb $2, 0($16) )
 	subl $18, 1, $18
 	addl $17, 1, $17
-	FIXUP_LDST( stb $2, 0($16) )
 	addl $16, 1, $16
 	bgt $18, $byte_loop_tail
 	br $31, $out
 
 /* misaligned src and dst */
-	.align 5
+	.align 4
 $quad_u_loop_head:
 	FIXUP_LDST( ldl_u $2, 0($17) )
 	FIXUP_LDST( ldl_u $3, 7($17) )
-	subl $18, 8, $18
-	addl $17, 8, $17
 	extll $2, $4, $2
 	exthl $3, $4, $3
 	bis $2, $3, $2
 	FIXUP_LDST( stl $2, 0($16) )
 	addl $16, 8, $16
-	blt $18, $simd_end
+	addl $17, 8, $17
+	subl $18, 8, $18
 	and $16, 31, $1
 	beq $1, $dest_aligned_32
 	br $31, $quad_u_loop_head
@@ -210,53 +208,54 @@ $prep_simd_u_loop:
 	cmple $18, $1, $1
 	bne $1, $simd_u_loop
 
-	.align 5
+	.align 4
 $simd_u_loop_nc:
 	FIXUP_LDST( vldd $f5, 32($3) )
 	fillcs 128 * 5($3)
-	srlow $f4, $f1, $f10
-	sllow $f5, $f2, $f11
-	vlogfc $f10, $f11, $f31, $f10
+	srlow $f4, $f1, $f4
+	sllow $f5, $f2, $f3
+	vlogfc $f3, $f4, $f31, $f3
+	FIXUP_LDST( vstd_nc $f3, 0($16) )
 	FIXUP_LDST( vldd $f4, 64($3) )
-	srlow $f5, $f1, $f20
-	sllow $f4, $f2, $f21
-	vlogfc $f20, $f21, $f31, $f20
-	FIXUP_LDST( vstd_nc $f10, 0($16) )
-	FIXUP_LDST( vstd_nc $f20, 32($16) )
+	srlow $f5, $f1, $f5
+	sllow $f4, $f2, $f3
+	vlogfc $f5, $f3, $f31, $f5
+	FIXUP_LDST( vstd_nc $f5, 32($16) )
 	subl $18, 64, $18
 	addl $3, 64, $3
 	addl $16, 64, $16
-	bge $18, $simd_u_loop_nc
+	cmplt $18, 64, $1
+	beq $1, $simd_u_loop_nc
 	memb			# required for _nc store instructions
 	br $31, $simd_u_loop_end
 
-	.align 5
+	.align 4
 $simd_u_loop:
 	FIXUP_LDST( vldd $f5, 32($3) )
 	fillcs 128 * 5($3)
-	srlow $f4, $f1, $f10
-	sllow $f5, $f2, $f11
-	vlogfc $f10, $f11, $f31, $f10
+	srlow $f4, $f1, $f4
+	sllow $f5, $f2, $f3
+	vlogfc $f4, $f3, $f31, $f3
+	FIXUP_LDST( vstd $f3, 0($16) )
 	FIXUP_LDST( vldd $f4, 64($3) )
-	srlow $f5, $f1, $f20
-	sllow $f4, $f2, $f21
-	vlogfc $f20, $f21, $f31, $f20
-	FIXUP_LDST( vstd $f10, 0($16) )
-	FIXUP_LDST( vstd $f20, 32($16) )
+	srlow $f5, $f1, $f5
+	sllow $f4, $f2, $f3
+	vlogfc $f5, $f3, $f31, $f3
+	FIXUP_LDST( vstd $f3, 32($16) )
 	subl $18, 64, $18
 	addl $3, 64, $3
 	addl $16, 64, $16
-	bge $18, $simd_u_loop
+	cmplt $18, 64, $1
+	beq $1, $simd_u_loop
 
 $simd_u_loop_end:
-	addl $18, 64, $1
-	cmplt $1, 32, $1
+	cmplt $18, 32, $1
 	bne $1, $no_more_simd_u
 	FIXUP_LDST( vldd $f5, 32($3) )
-	srlow $f4, $f1, $f10
-	sllow $f5, $f2, $f11
-	vlogfc $f10, $f11, $f31, $f10
-	FIXUP_LDST( vstd $f10, 0($16) )
+	srlow $f4, $f1, $f4
+	sllow $f5, $f2, $f3
+	vlogfc $f4, $f3, $f31, $f3
+	FIXUP_LDST( vstd $f3, 0($16) )
 	subl $18, 32, $18
 	addl $3, 32, $3
 	addl $16, 32, $16
@@ -268,7 +267,7 @@ $no_more_simd_u:
 $prep_quad_u_loop_tail:
 	FIXUP_LDST( ldl_u $2, 0($17) )
 
-	.align 5
+	.align 4
 $quad_u_loop_tail:
 	FIXUP_LDST( ldl_u $3, 8($17) )
 	extll $2, $4, $22
@@ -283,18 +282,19 @@ $quad_u_loop_tail:
 	subl $18, 16, $18
 	addl $17, 16, $17
 	addl $16, 16, $16
-	bge $18, $quad_u_loop_tail
-	br $31, $quad_end
+	cmplt $18, 16, $1
+	beq $1, $quad_u_loop_tail
+	br $31, $quad_loop_end
 
 $move_one_quad_u:
 	FIXUP_LDST( ldl_u $2, 0($17) )
 	FIXUP_LDST( ldl_u $3, 8($17) )
-	subl $18, 8, $18
-	addl $17, 8, $17
 	extll $2, $4, $22
 	exthl $3, $4, $23
 	bis $22, $23, $22
 	FIXUP_LDST( stl $22, 0($16) )
+	subl $18, 8, $18
+	addl $17, 8, $17
 	addl $16, 8, $16
 	ble $18, $out
 	br $31, $byte_loop_tail
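
For reference when reading the reordered load/store sequences above: a minimal C sketch of the staged copy strategy this template implements, assuming the same head/body/tail structure (byte copies to 8-byte-align dest, quads to 32-byte-align it, a 64-byte main loop, then 32/16/8/1-byte tails). It is illustration only, not part of the patch; sketch_copy() is a hypothetical name and memcpy() stands in for the ldl/stl and vldd/vstd pairs.

/*
 * Illustrative sketch of the copy staging, mirroring the key property of
 * the reordered FIXUP_LDST sequences: the bytes-left counter and the
 * pointers are only advanced after both the load and the store of a chunk
 * have completed.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void *sketch_copy(void *dst, const void *src, size_t left)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	/* byte loop head: align dst to 8 bytes */
	while (left && ((uintptr_t)d & 7)) {
		*d = *s;			/* store right after load  */
		left--; s++; d++;		/* bookkeeping afterwards  */
	}
	/* quad loop head: align dst to 32 bytes, 8 bytes at a time */
	while (left >= 8 && ((uintptr_t)d & 31)) {
		memcpy(d, s, 8);
		d += 8; s += 8; left -= 8;
	}
	/* main loop: 64 bytes per iteration (two 32-byte vector moves) */
	while (left >= 64) {
		memcpy(d, s, 64);
		d += 64; s += 64; left -= 64;
	}
	/* tails: one 32-byte move, 16-byte pairs, one quad, then bytes */
	if (left >= 32) {
		memcpy(d, s, 32);
		d += 32; s += 32; left -= 32;
	}
	while (left >= 16) {
		memcpy(d, s, 16);
		d += 16; s += 16; left -= 16;
	}
	if (left >= 8) {
		memcpy(d, s, 8);
		d += 8; s += 8; left -= 8;
	}
	while (left--)
		*d++ = *s++;
	return dst;
}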