/* csum-copy.S */
/*
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 *	
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file COPYING in the main directory of this archive
 * for more details. No warranty for anything given at all.
 */
 	#include <linux/linkage.h>
	#include <asm/errno.h>

/*
 * Checksum copy with exception handling.
 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT. Zeroing the
 * destination is not done here; it is left to the wrappers.
 *
 * Input
 * rdi  source
 * rsi  destination
 * edx  len (32bit)
 * ecx  sum (32bit)
 * r8   src_err_ptr (int)
 * r9   dst_err_ptr (int)
 *
 * Output
 * eax  32bit sum (the 64bit accumulator is folded before returning).
 *      Undefined in case of exception.
 *
 * Wrappers need to take care of valid exception sum and zeroing.
 * They also should align source or destination to 8 bytes.
 */

	/*
	 * source - tag the next instruction as a read from the source buffer.
	 *
	 * Drops a numeric local label at the point of use and appends the pair
	 * (address of that label, .Lbad_source) to the __ex_table section, then
	 * switches back to the section that was active before the macro.
	 * Presumably the kernel fault-fixup code consumes this table.
	 */
	.macro source
10:
	.pushsection __ex_table,"a"
	.balign 8
	.quad 10b,.Lbad_source
	.popsection
	.endm
		
	/*
	 * dest - tag the next instruction as a write to the destination buffer.
	 *
	 * Drops a numeric local label at the point of use and appends the pair
	 * (address of that label, .Lbad_dest) to the __ex_table section, then
	 * switches back to the section that was active before the macro.
	 * Presumably the kernel fault-fixup code consumes this table.
	 */
	.macro dest
20:
	.pushsection __ex_table,"a"
	.balign 8
	.quad 20b,.Lbad_dest
	.popsection
	.endm
			
	/*
	 * ignore - tag the next instruction with a caller-chosen fixup target.
	 *
	 * L is the label recorded as the second word of the __ex_table entry;
	 * it defaults to .Lignore when the macro is used without an argument.
	 * The first word is the address of the numeric label dropped here.
	 */
	.macro ignore L=.Lignore
30:
	.pushsection __ex_table,"a"
	.balign 8
	.quad 30b,\L
	.popsection
	.endm
	
				
	.globl csum_partial_copy_generic
/*
 * csum_partial_copy_generic - copy a buffer and accumulate its checksum.
 *
 * In:
 *   rdi  source
 *   rsi  destination
 *   edx  len (32bit)
 *   ecx  initial sum (32bit)
 *   r8   src_err_ptr, may be NULL
 *   r9   dst_err_ptr, may be NULL
 * Out:
 *   eax  sum folded to 32 bits. After a fault the value is undefined
 *        (the handlers leave the error pointer in rax).
 *
 * Stack frame (7 quadwords):
 *   0*8  src_err_ptr        1*8  dst_err_ptr
 *   2*8  rbx   3*8  r12   4*8  r14   5*8  r13   6*8  rbp
 *
 * Register use:
 *   rax  running sum          rcx  length, later tail counters
 *   r9   constant zero        r10  copy of the length for the tail code
 *   r12  64-byte block count
 *   rbx, r8, r11, rdx, r10, rbp, r14, r13  data temporaries in the main loop
 *
 * The add-with-carry chains run across loop iterations. The loop bodies
 * therefore use only mov, lea and dec between the adc instructions, because
 * none of those modify CF; jnz tests the ZF produced by dec.
 */
	.p2align 4
csum_partial_copy_generic:
	/*
	 * Default target named by the ignore macro. Nothing in this function
	 * branches here; the label is kept so a bare "ignore" still resolves.
	 */
.Lignore:
	subq	$7*8,%rsp
	movq	%rbx,2*8(%rsp)
	movq	%r12,3*8(%rsp)
	movq	%r14,4*8(%rsp)
	movq	%r13,5*8(%rsp)
	movq	%rbp,6*8(%rsp)

	/* r8 and r9 are reused below, so park the error pointers */
	movq	%r8,(%rsp)
	movq	%r9,1*8(%rsp)

	movl	%ecx,%eax		/* rax = initial sum, zero-extended */
	movl	%edx,%ecx		/* rcx = len, zero-extended */

	xorl	%r9d,%r9d		/* r9 = 0, used to add in carries */
	movq	%rcx,%r12

	shrq	$6,%r12			/* r12 = number of 64-byte blocks */
	jz	.Lhandle_tail		/* < 64 */

	clc				/* start the adc chain with CF = 0 */

	/* main loop: copy and sum 64 bytes per iteration */
	.p2align 4
.Lloop:
	source
	movq	(%rdi),%rbx
	source
	movq	8(%rdi),%r8
	source
	movq	16(%rdi),%r11
	source
	movq	24(%rdi),%rdx

	source
	movq	32(%rdi),%r10
	source
	movq	40(%rdi),%rbp
	source
	movq	48(%rdi),%r14
	source
	movq	56(%rdi),%r13

	/* the table entry for the prefetch points at the next instruction */
	ignore 2f
	prefetcht0 5*64(%rdi)
2:
	adcq	%rbx,%rax
	adcq	%r8,%rax
	adcq	%r11,%rax
	adcq	%rdx,%rax
	adcq	%r10,%rax
	adcq	%rbp,%rax
	adcq	%r14,%rax
	adcq	%r13,%rax

	decl	%r12d			/* sets ZF for the jnz below, keeps CF */

	dest
	movq	%rbx,(%rsi)
	dest
	movq	%r8,8(%rsi)
	dest
	movq	%r11,16(%rsi)
	dest
	movq	%rdx,24(%rsi)

	dest
	movq	%r10,32(%rsi)
	dest
	movq	%rbp,40(%rsi)
	dest
	movq	%r14,48(%rsi)
	dest
	movq	%r13,56(%rsi)

	leaq	64(%rdi),%rdi		/* lea: advance without touching flags */
	leaq	64(%rsi),%rsi

	jnz	.Lloop

	adcq	%r9,%rax		/* add in the last carry */

	/* do last up to 56 bytes, one quadword at a time */
.Lhandle_tail:
	/* ecx: count */
	movl	%ecx,%r10d		/* keep the length for the later tails */
	andl	$63,%ecx
	shrl	$3,%ecx			/* ecx = remaining whole quadwords */
	jz	.Lfold
	clc
	.p2align 4
.Lloop_8:
	source
	movq	(%rdi),%rbx
	adcq	%rbx,%rax
	decl	%ecx
	dest
	movq	%rbx,(%rsi)
	leaq	8(%rsi),%rsi		/* preserve carry */
	leaq	8(%rdi),%rdi
	jnz	.Lloop_8
	adcq	%r9,%rax		/* add in carry */

.Lfold:
	/* reduce checksum to 32bits: low half + high half + carry */
	movl	%eax,%ebx
	shrq	$32,%rax
	addl	%ebx,%eax
	adcl	%r9d,%eax

	/* do last up to 6 bytes, one 16-bit word at a time */
.Lhandle_7:
	movl	%r10d,%ecx
	andl	$7,%ecx
	shrl	$1,%ecx			/* ecx = remaining whole words */
	jz	.Lhandle_1
	xorl	%ebx,%ebx		/* upper bits of ebx stay zero for movw */
	clc
	.p2align 4
.Lloop_1:
	source
	movw	(%rdi),%bx
	adcl	%ebx,%eax
	decl	%ecx
	dest
	movw	%bx,(%rsi)
	leaq	2(%rdi),%rdi
	leaq	2(%rsi),%rsi
	jnz	.Lloop_1
	adcl	%r9d,%eax		/* add in carry */

	/* handle last odd byte */
.Lhandle_1:
	testl	$1,%r10d
	jz	.Lende
	xorl	%ebx,%ebx
	source
	movb	(%rdi),%bl
	dest
	movb	%bl,(%rsi)
	addl	%ebx,%eax
	adcl	%r9d,%eax		/* carry */

	/* common exit: restore the saved registers and drop the frame */
.Lende:
	movq	2*8(%rsp),%rbx
	movq	3*8(%rsp),%r12
	movq	4*8(%rsp),%r14
	movq	5*8(%rsp),%r13
	movq	6*8(%rsp),%rbp
	addq	$7*8,%rsp
	ret

	/* Exception handlers. Very simple, zeroing is done in the wrappers */
.Lbad_source:
	movq	(%rsp),%rax		/* saved src_err_ptr */
	testq	%rax,%rax
	jz	.Lende			/* NULL: nothing to report */
	movl	$-EFAULT,(%rax)
	jmp	.Lende

.Lbad_dest:
	movq	8(%rsp),%rax		/* saved dst_err_ptr */
	testq	%rax,%rax
	jz	.Lende			/* NULL: nothing to report */
	movl	$-EFAULT,(%rax)
	jmp	.Lende