csum_partial.S
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Quick'n'dirty IP checksum ...
 *
 * Copyright (C) 1998, 1999 Ralf Baechle
 * Copyright (C) 1999 Silicon Graphics, Inc.
 * Copyright (C) 2007  Maciej W. Rozycki
 */
#include <linux/errno.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#ifdef CONFIG_64BIT
/*
 * As we share a code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions from
 * the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOAD32 lwu
#define ADD    daddu
#define NBYTES 8

#else

#define LOAD   lw
#define LOAD32 lw
#define ADD    addu
#define NBYTES 4

#endif /* USE_DOUBLE */

#define UNIT(unit)  ((unit)*NBYTES)

#define ADDC(sum,reg)						\
	ADD	sum, reg;					\
	sltu	v1, sum, reg;					\
	ADD	sum, v1;					\

#define ADDC32(sum,reg)						\
	addu	sum, reg;					\
	sltu	v1, sum, reg;					\
	addu	sum, v1;					\

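/*
 * ADDC is an end-around-carry add, as the ones' complement Internet
 * checksum requires: the carry out of the ADD is recovered with sltu and
 * folded back into the sum.  A rough C sketch (illustrative only):
 *
 *	sum += reg;
 *	if (sum < reg)		   unsigned wrap-around, i.e. carry out
 *		sum += 1;	   fold the carry back in
 *
 * ADDC32 is the same operation done on 32-bit quantities regardless of
 * USE_DOUBLE; it is used to mix in the caller-supplied partial checksum.
 */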
#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)	\
	LOAD	_t0, (offset + UNIT(0))(src);			\
	LOAD	_t1, (offset + UNIT(1))(src);			\
	LOAD	_t2, (offset + UNIT(2))(src); 			\
	LOAD	_t3, (offset + UNIT(3))(src); 			\
	ADDC(sum, _t0);						\
	ADDC(sum, _t1);						\
	ADDC(sum, _t2);						\
	ADDC(sum, _t3)

#ifdef USE_DOUBLE
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
#else
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);	\
	CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
#endif
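/*
 * Either way CSUM_BIGCHUNK consumes 0x20 bytes per invocation: four
 * 8-byte loads with USE_DOUBLE, or two rounds of four 4-byte loads
 * without it.
 */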

/*
 * a0: source address
 * a1: length of the area to checksum
 * a2: partial checksum
 */
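/*
 * In C terms this is roughly (see <asm/checksum.h>):
 *
 *	__wsum csum_partial(const void *buff, int len, __wsum sum);
 *
 * The value returned in v0 is still a 32-bit partial sum; callers fold
 * it down to 16 bits with csum_fold() when the final checksum is needed.
 */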

#define src a0
#define sum v0

	.text
	.set	noreorder
	.align	5
LEAF(csum_partial)
	move	sum, zero
	move	t7, zero

	sltiu	t8, a1, 0x8
	bnez	t8, .Lsmall_csumcpy		/* < 8 bytes to copy */
	 move	t2, a1

	andi	t7, src, 0x1			/* odd buffer? */

.Lhword_align:
	beqz	t7, .Lword_align
	 andi	t8, src, 0x2

	lbu	t0, (src)
	LONG_SUBU	a1, a1, 0x1
#ifdef __MIPSEL__
	sll	t0, t0, 8
#endif
	ADDC(sum, t0)
	PTR_ADDU	src, src, 0x1
	andi	t8, src, 0x2

.Lword_align:
	beqz	t8, .Ldword_align
	 sltiu	t8, a1, 56

	lhu	t0, (src)
	LONG_SUBU	a1, a1, 0x2
	ADDC(sum, t0)
	sltiu	t8, a1, 56
	PTR_ADDU	src, src, 0x2

.Ldword_align:
	bnez	t8, .Ldo_end_words
	 move	t8, a1

	andi	t8, src, 0x4
	beqz	t8, .Lqword_align
	 andi	t8, src, 0x8

	LOAD32	t0, 0x00(src)
	LONG_SUBU	a1, a1, 0x4
	ADDC(sum, t0)
	PTR_ADDU	src, src, 0x4
	andi	t8, src, 0x8

.Lqword_align:
	beqz	t8, .Loword_align
	 andi	t8, src, 0x10

#ifdef USE_DOUBLE
	ld	t0, 0x00(src)
	LONG_SUBU	a1, a1, 0x8
	ADDC(sum, t0)
#else
	lw	t0, 0x00(src)
	lw	t1, 0x04(src)
	LONG_SUBU	a1, a1, 0x8
	ADDC(sum, t0)
	ADDC(sum, t1)
#endif
	PTR_ADDU	src, src, 0x8
	andi	t8, src, 0x10

.Loword_align:
	beqz	t8, .Lbegin_movement
	 LONG_SRL	t8, a1, 0x7

#ifdef USE_DOUBLE
	ld	t0, 0x00(src)
	ld	t1, 0x08(src)
	ADDC(sum, t0)
	ADDC(sum, t1)
#else
	CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
#endif
	LONG_SUBU	a1, a1, 0x10
	PTR_ADDU	src, src, 0x10
	LONG_SRL	t8, a1, 0x7

.Lbegin_movement:
	beqz	t8, 1f
	 andi	t2, a1, 0x40

.Lmove_128bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
	LONG_SUBU	t8, t8, 0x01
	.set	reorder				/* DADDI_WAR */
	PTR_ADDU	src, src, 0x80
	bnez	t8, .Lmove_128bytes
	.set	noreorder

1:
	beqz	t2, 1f
	 andi	t2, a1, 0x20

.Lmove_64bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	PTR_ADDU	src, src, 0x40

1:
	beqz	t2, .Ldo_end_words
	 andi	t8, a1, 0x1c

.Lmove_32bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	andi	t8, a1, 0x1c
	PTR_ADDU	src, src, 0x20

.Ldo_end_words:
	beqz	t8, .Lsmall_csumcpy
	 andi	t2, a1, 0x3
	LONG_SRL	t8, t8, 0x2

.Lend_words:
	LOAD32	t0, (src)
	LONG_SUBU	t8, t8, 0x1
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	PTR_ADDU	src, src, 0x4
	bnez	t8, .Lend_words
	.set	noreorder

/* unknown src alignment and < 8 bytes to go  */
.Lsmall_csumcpy:
	move	a1, t2

	andi	t0, a1, 4
	beqz	t0, 1f
	 andi	t0, a1, 2

	/* Still a full word to go  */
	ulw	t1, (src)
	PTR_ADDIU	src, 4
#ifdef USE_DOUBLE
	dsll	t1, t1, 32			/* clear lower 32bit */
#endif
	ADDC(sum, t1)

1:	move	t1, zero
	beqz	t0, 1f
	 andi	t0, a1, 1

	/* Still a halfword to go  */
	ulhu	t1, (src)
	PTR_ADDIU	src, 2

1:	beqz	t0, 1f
	 sll	t1, t1, 16

	lbu	t2, (src)
	 nop

#ifdef __MIPSEB__
	sll	t2, t2, 8
#endif
	or	t1, t2

1:	ADDC(sum, t1)

	/* fold checksum */
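	/*
	 * With USE_DOUBLE the running sum is 64 bits wide, so add its two
	 * 32-bit halves with end-around carry first.  Roughly:
	 *
	 *	lo = (u32)sum;  hi = sum >> 32;
	 *	sum = hi + lo;
	 *	if (sum < lo)		   carry out of the 32-bit add
	 *		sum += 1;
	 */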
#ifdef USE_DOUBLE
	dsll32	v1, sum, 0
	daddu	sum, v1
	sltu	v1, sum, v1
	dsra32	sum, sum, 0
	addu	sum, v1
#endif

	/* odd buffer alignment? */
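	/*
	 * If the buffer started on an odd address (t7 != 0), every byte
	 * was accumulated in the opposite byte lane of its 16-bit half,
	 * so swap the bytes of each half of the sum to compensate: wsbh
	 * does this directly on MIPS R2, the generic path uses masks and
	 * shifts.
	 */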
#ifdef CONFIG_CPU_MIPSR2
	wsbh	v1, sum
	movn	sum, v1, t7
#else
	beqz	t7, 1f			/* odd buffer alignment? */
	 lui	v1, 0x00ff
	addu	v1, 0x00ff
	and	t0, sum, v1
	sll	t0, t0, 8
	srl	sum, sum, 8
	and	sum, sum, v1
	or	sum, sum, t0
1:
#endif
	.set	reorder
	/* Add the passed partial csum.  */
	ADDC32(sum, a2)
	jr	ra
	.set	noreorder
	END(csum_partial)


/*
 * checksum and copy routines based on memcpy.S
 *
 *	csum_partial_copy_nocheck(src, dst, len, sum)
 *	__csum_partial_copy_user(src, dst, len, sum, errp)
 *
 * See "Spec" in memcpy.S for details.  Unlike __copy_user, all
 * functions in this file use the standard calling convention.
 */
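/*
 * In C terms the two entry points look roughly like this (parameter
 * types as declared in <asm/checksum.h>):
 *
 *	__wsum csum_partial_copy_nocheck(const void *src, void *dst,
 *					 int len, __wsum sum);
 *	__wsum __csum_partial_copy_user(const void *src, void *dst,
 *					int len, __wsum sum, int *errp);
 *
 * On a faulting access the second form stores -EFAULT through errp; see
 * the exception handlers at the end of this file.
 */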

#define src a0
#define dst a1
#define len a2
#define psum a3
#define sum v0
#define odd t8
#define errptr t9

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by __csum_partial_copy_from_user and maintained by
 *	not writing AT in __csum_partial_copy
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores store -EFAULT to errptr and return.
 * These handlers do not need to overwrite any data.
 */

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
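/*
 * EXC wraps a single memory access in an exception table entry: the
 * access gets the local label 9, and a (9b, handler) pair is emitted
 * into __ex_table so that a fault at that instruction branches to the
 * named fixup label instead of being treated as a kernel bug.
 */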

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#define SHIFT_DISCARD_REVERT SRLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#define SHIFT_DISCARD_REVERT SLLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)

#define ADDRMASK (NBYTES-1)
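/*
 * FIRST(u) and REST(u) are the two offsets touched by the unaligned
 * load/store pair within unit u.  LDFIRST/LDREST (and STFIRST/STREST)
 * map to lwl/lwr or ldl/ldr, ordered for the configured endianness, so
 * that one pair assembles or stores a full NBYTES-wide unaligned word.
 */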

#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	.set	noat
#else
	.set	at=v1
#endif

LEAF(__csum_partial_copy_user)
	PTR_ADDU	AT, src, len	/* See (1) above. */
#ifdef CONFIG_64BIT
	move	errptr, a4
#else
	lw	errptr, 16(sp)
#endif
FEXPORT(csum_partial_copy_nocheck)
	move	sum, zero
	move	odd, zero
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	bnez	t2, .Lcopy_bytes_checklen
	 and	t0, src, ADDRMASK
	andi	odd, dst, 0x1			/* odd buffer? */
	bnez	t1, .Ldst_unaligned
	 nop
	bnez	t0, .Lsrc_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
.Lboth_aligned:
	 SRL	t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
	beqz	t0, .Lcleanup_both_aligned # len < 8*NBYTES
	 nop
	SUB	len, 8*NBYTES		# subtract here for bgez loop
	.align	4
1:
EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
EXC(	LOAD	t4, UNIT(4)(src),	.Ll_exc_copy)
EXC(	LOAD	t5, UNIT(5)(src),	.Ll_exc_copy)
EXC(	LOAD	t6, UNIT(6)(src),	.Ll_exc_copy)
EXC(	LOAD	t7, UNIT(7)(src),	.Ll_exc_copy)
	SUB	len, len, 8*NBYTES
	ADD	src, src, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc)
	ADDC(sum, t0)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc)
	ADDC(sum, t1)
EXC(	STORE	t2, UNIT(2)(dst),	.Ls_exc)
	ADDC(sum, t2)
EXC(	STORE	t3, UNIT(3)(dst),	.Ls_exc)
	ADDC(sum, t3)
EXC(	STORE	t4, UNIT(4)(dst),	.Ls_exc)
	ADDC(sum, t4)
EXC(	STORE	t5, UNIT(5)(dst),	.Ls_exc)
	ADDC(sum, t5)
EXC(	STORE	t6, UNIT(6)(dst),	.Ls_exc)
	ADDC(sum, t6)
EXC(	STORE	t7, UNIT(7)(dst),	.Ls_exc)
	ADDC(sum, t7)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 8*NBYTES
	bgez	len, 1b
	.set	noreorder
	ADD	len, 8*NBYTES		# revert len (see above)

	/*
	 * len == the number of bytes left to copy < 8*NBYTES
	 */
.Lcleanup_both_aligned:
#define rem t7
	beqz	len, .Ldone
	 sltu	t0, len, 4*NBYTES
	bnez	t0, .Lless_than_4units
	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc)
	ADDC(sum, t0)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc)
	ADDC(sum, t1)
EXC(	STORE	t2, UNIT(2)(dst),	.Ls_exc)
	ADDC(sum, t2)
EXC(	STORE	t3, UNIT(3)(dst),	.Ls_exc)
	ADDC(sum, t3)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	beqz	len, .Ldone
	.set	noreorder
.Lless_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, .Lcopy_bytes
	 nop
1:
EXC(	LOAD	t0, 0(src),		.Ll_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		.Ls_exc)
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	rem, len, 1b
	.set	noreorder

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
#define bits t2
	beqz	len, .Ldone
	 ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
EXC(	LOAD	t0, 0(src),		.Ll_exc)
	SUB	bits, bits, rem	# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
EXC(	STREST	t0, -1(t1),		.Ls_exc)
	SHIFT_DISCARD_REVERT t0, t0, bits
	.set reorder
	ADDC(sum, t0)
	b	.Ldone
	.set noreorder
.Ldst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; T1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
EXC(	LDFIRST	t3, FIRST(0)(src),	.Ll_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src),	.Ll_exc_copy)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
EXC(	STFIRST t3, FIRST(0)(dst),	.Ls_exc)
	SLL	t4, t1, 3		# t4 = number of bits to discard
	SHIFT_DISCARD t3, t3, t4
	/* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
	ADDC(sum, t3)
	beq	len, t2, .Ldone
	 SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, .Lboth_aligned
	 ADD	src, src, t2

.Lsrc_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
	beqz	t0, .Lcleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	.Ll_exc_copy)
	SUB     len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	.Ll_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	.Ll_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	.Ll_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	.Ll_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	.Ll_exc_copy)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc)
	ADDC(sum, t0)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc)
	ADDC(sum, t1)
EXC(	STORE	t2, UNIT(2)(dst),	.Ls_exc)
	ADDC(sum, t2)
EXC(	STORE	t3, UNIT(3)(dst),	.Ls_exc)
	ADDC(sum, t3)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcleanup_src_unaligned:
	beqz	len, .Ldone
	 and	rem, len, NBYTES-1  # rem = len % NBYTES
	beq	rem, len, .Lcopy_bytes
	 nop
1:
EXC(	LDFIRST t0, FIRST(0)(src),	.Ll_exc)
EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		.Ls_exc)
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcopy_bytes_checklen:
	beqz	len, .Ldone
	 nop
.Lcopy_bytes:
	/* 0 < len < NBYTES  */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define SHIFT_START 0
#define SHIFT_INC 8
#else
#define SHIFT_START 8*(NBYTES-1)
#define SHIFT_INC -8
#endif
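	/*
	 * Each remaining byte is stored to dst and also merged into the
	 * partial word t2 in the lane it would occupy in an aligned word
	 * load: t3 starts at SHIFT_START and steps by SHIFT_INC, walking
	 * the byte lanes in memory order for this endianness, and SLLV
	 * positions each byte before it is ORed in.  The assembled word
	 * is added to the checksum at .Lcopy_bytes_done.
	 */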
	move	t2, zero	# partial word
	li	t3, SHIFT_START	# shift
/* use .Ll_exc_copy here to return correct sum on fault */
#define COPY_BYTE(N)			\
EXC(	lbu	t0, N(src), .Ll_exc_copy);	\
	SUB	len, len, 1;		\
EXC(	sb	t0, N(dst), .Ls_exc);	\
	SLLV	t0, t0, t3;		\
	addu	t3, SHIFT_INC;		\
	beqz	len, .Lcopy_bytes_done;	\
	 or	t2, t0

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lbu	t0, NBYTES-2(src), .Ll_exc_copy)
	SUB	len, len, 1
EXC(	sb	t0, NBYTES-2(dst), .Ls_exc)
	SLLV	t0, t0, t3
	or	t2, t0
.Lcopy_bytes_done:
	ADDC(sum, t2)
.Ldone:
	/* fold checksum */
#ifdef USE_DOUBLE
	dsll32	v1, sum, 0
	daddu	sum, v1
	sltu	v1, sum, v1
	dsra32	sum, sum, 0
	addu	sum, v1
#endif

#ifdef CONFIG_CPU_MIPSR2
	wsbh	v1, sum
	movn	sum, v1, odd
#else
	beqz	odd, 1f			/* odd buffer alignment? */
	 lui	v1, 0x00ff
	addu	v1, 0x00ff
	and	t0, sum, v1
	sll	t0, t0, 8
	srl	sum, sum, 8
	and	sum, sum, v1
	or	sum, sum, t0
1:
#endif
	.set reorder
	ADDC32(sum, psum)
	jr	ra
	.set noreorder

.Ll_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	 li	t2, SHIFT_START
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lbu	t1, 0(src),	.Ll_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	SLLV	t1, t1, t2
	addu	t2, SHIFT_INC
	ADDC(sum, t1)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 1
	bne	src, t0, 1b
	.set	noreorder
.Ll_exc:
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	 nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 *   See (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address in a1
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
	.set	reorder				/* DADDI_WAR */
	SUB	src, len, 1
	beqz	len, .Ldone
	.set	noreorder
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
	.set	push
	.set	noat
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	bnez	src, 1b
	 SUB	src, src, 1
#else
	li	v1, 1
	bnez	src, 1b
	 SUB	src, src, v1
#endif
	li	v1, -EFAULT
	b	.Ldone
	 sw	v1, (errptr)

.Ls_exc:
	li	v0, -1 /* invalid checksum */
	li	v1, -EFAULT
	jr	ra
	 sw	v1, (errptr)
	.set	pop
	END(__csum_partial_copy_user)