/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Quick'n'dirty IP checksum ...
 *
 * Copyright (C) 1998, 1999 Ralf Baechle
 * Copyright (C) 1999 Silicon Graphics, Inc.
 * Copyright (C) 2007  Maciej W. Rozycki
 * Copyright (C) 2014 Imagination Technologies Ltd.
 */
#include <linux/errno.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#ifdef CONFIG_64BIT
/*
 * As we share the code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions from
 * the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOAD32 lwu
#define ADD    daddu
#define NBYTES 8

#else

#define LOAD   lw
#define LOAD32 lw
#define ADD    addu
#define NBYTES 4

#endif /* USE_DOUBLE */

#define UNIT(unit)  ((unit)*NBYTES)

#define ADDC(sum,reg)						\
	ADD	sum, reg;					\
	sltu	v1, sum, reg;					\
	ADD	sum, v1;					\

#define ADDC32(sum,reg)						\
	addu	sum, reg;					\
	sltu	v1, sum, reg;					\
	addu	sum, v1;					\

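/*
 * ADDC/ADDC32 add a value into the running checksum with end-around
 * carry, as the ones' complement Internet checksum requires: the sltu
 * detects the carry out of the add, and the carry is folded straight
 * back in.  As a rough C sketch (explanatory only, not original code):
 *
 *	sum += reg;
 *	sum += (sum < reg);	end-around carry
 *
 * ADDC32 forces 32-bit adds and is used when folding in the 32-bit
 * partial checksum passed by the caller.
 */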
#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)	\
	LOAD	_t0, (offset + UNIT(0))(src);			\
	LOAD	_t1, (offset + UNIT(1))(src);			\
	LOAD	_t2, (offset + UNIT(2))(src);			\
	LOAD	_t3, (offset + UNIT(3))(src);			\
	ADDC(sum, _t0);						\
	ADDC(sum, _t1);						\
	ADDC(sum, _t2);						\
	ADDC(sum, _t3)

#ifdef USE_DOUBLE
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
#else
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);	\
	CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
#endif
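/*
 * Note that CSUM_BIGCHUNK always checksums 32 bytes per invocation:
 * four 8-byte loads with USE_DOUBLE, otherwise two CSUM_BIGCHUNK1
 * passes of four 4-byte loads each.
 */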

/*
 * a0: source address
 * a1: length of the area to checksum
 * a2: partial checksum
 */
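/*
 * As a reading aid, this is believed to correspond to the usual C
 * prototype (types as in <asm/checksum.h>):
 *
 *	__wsum csum_partial(const void *buff, int len, __wsum sum);
 */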

#define src a0
#define sum v0

	.text
	.set	noreorder
	.align	5
LEAF(csum_partial)
	move	sum, zero
	move	t7, zero

	sltiu	t8, a1, 0x8
	bnez	t8, .Lsmall_csumcpy		/* < 8 bytes to copy */
	 move	t2, a1

	andi	t7, src, 0x1			/* odd buffer? */

.Lhword_align:
	beqz	t7, .Lword_align
	 andi	t8, src, 0x2

	lbu	t0, (src)
	LONG_SUBU	a1, a1, 0x1
#ifdef __MIPSEL__
	sll	t0, t0, 8
#endif
	ADDC(sum, t0)
	PTR_ADDU	src, src, 0x1
	andi	t8, src, 0x2

.Lword_align:
	beqz	t8, .Ldword_align
	 sltiu	t8, a1, 56

	lhu	t0, (src)
	LONG_SUBU	a1, a1, 0x2
	ADDC(sum, t0)
	sltiu	t8, a1, 56
	PTR_ADDU	src, src, 0x2

.Ldword_align:
	bnez	t8, .Ldo_end_words
	 move	t8, a1

	andi	t8, src, 0x4
	beqz	t8, .Lqword_align
	 andi	t8, src, 0x8

	LOAD32	t0, 0x00(src)
	LONG_SUBU	a1, a1, 0x4
	ADDC(sum, t0)
	PTR_ADDU	src, src, 0x4
	andi	t8, src, 0x8

.Lqword_align:
	beqz	t8, .Loword_align
	 andi	t8, src, 0x10

#ifdef USE_DOUBLE
	ld	t0, 0x00(src)
	LONG_SUBU	a1, a1, 0x8
	ADDC(sum, t0)
#else
	lw	t0, 0x00(src)
	lw	t1, 0x04(src)
	LONG_SUBU	a1, a1, 0x8
	ADDC(sum, t0)
	ADDC(sum, t1)
#endif
	PTR_ADDU	src, src, 0x8
	andi	t8, src, 0x10

.Loword_align:
	beqz	t8, .Lbegin_movement
	 LONG_SRL	t8, a1, 0x7

#ifdef USE_DOUBLE
	ld	t0, 0x00(src)
	ld	t1, 0x08(src)
	ADDC(sum, t0)
	ADDC(sum, t1)
#else
	CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
#endif
	LONG_SUBU	a1, a1, 0x10
	PTR_ADDU	src, src, 0x10
	LONG_SRL	t8, a1, 0x7

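/*
 * src is now aligned for the unrolled loops below.  t8 = a1 >> 7 counts
 * the remaining full 128-byte blocks; the leftover 64-byte, 32-byte,
 * word-sized and byte-sized pieces are handled by the sections that
 * follow.
 */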
.Lbegin_movement:
	beqz	t8, 1f
	 andi	t2, a1, 0x40

.Lmove_128bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
	LONG_SUBU	t8, t8, 0x01
	.set	reorder				/* DADDI_WAR */
	PTR_ADDU	src, src, 0x80
	bnez	t8, .Lmove_128bytes
	.set	noreorder

1:
	beqz	t2, 1f
	 andi	t2, a1, 0x20

.Lmove_64bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	PTR_ADDU	src, src, 0x40

1:
	beqz	t2, .Ldo_end_words
	 andi	t8, a1, 0x1c

.Lmove_32bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	andi	t8, a1, 0x1c
	PTR_ADDU	src, src, 0x20

.Ldo_end_words:
	beqz	t8, .Lsmall_csumcpy
	 andi	t2, a1, 0x3
	LONG_SRL	t8, t8, 0x2

.Lend_words:
	LOAD32	t0, (src)
	LONG_SUBU	t8, t8, 0x1
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	PTR_ADDU	src, src, 0x4
	bnez	t8, .Lend_words
	.set	noreorder

/* unknown src alignment and < 8 bytes to go  */
.Lsmall_csumcpy:
	move	a1, t2

	andi	t0, a1, 4
	beqz	t0, 1f
	 andi	t0, a1, 2

	/* Still a full word to go  */
	ulw	t1, (src)
	PTR_ADDIU	src, 4
#ifdef USE_DOUBLE
	dsll	t1, t1, 32			/* clear lower 32bit */
#endif
	ADDC(sum, t1)

1:	move	t1, zero
	beqz	t0, 1f
	 andi	t0, a1, 1

	/* Still a halfword to go  */
	ulhu	t1, (src)
	PTR_ADDIU	src, 2

1:	beqz	t0, 1f
	 sll	t1, t1, 16

	lbu	t2, (src)
	 nop

#ifdef __MIPSEB__
	sll	t2, t2, 8
#endif
	or	t1, t2

1:	ADDC(sum, t1)

	/* fold checksum */
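	/*
	 * With USE_DOUBLE this folds the 64-bit accumulator down to
	 * 32 bits: the high and low halves are added together and the
	 * carry out of that add is wrapped back in.  Roughly, as a sketch:
	 *
	 *	sum = (u32)sum + (u32)(sum >> 32) + carry;
	 */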
#ifdef USE_DOUBLE
	dsll32	v1, sum, 0
	daddu	sum, v1
	sltu	v1, sum, v1
	dsra32	sum, sum, 0
	addu	sum, v1
#endif

	/* odd buffer alignment? */
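	/*
	 * If the buffer started on an odd address, every byte was summed
	 * in the wrong byte lane, so swap the two bytes of each halfword
	 * of the sum (wsbh/movn on MIPS R2, mask and shift otherwise).
	 * In C terms this is roughly:
	 *
	 *	sum = ((sum & 0x00ff00ff) << 8) | ((sum >> 8) & 0x00ff00ff);
	 */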
#ifdef CONFIG_CPU_MIPSR2
	wsbh	v1, sum
	movn	sum, v1, t7
#else
	beqz	t7, 1f			/* odd buffer alignment? */
	 lui	v1, 0x00ff
	addu	v1, 0x00ff
	and	t0, sum, v1
	sll	t0, t0, 8
	srl	sum, sum, 8
	and	sum, sum, v1
	or	sum, sum, t0
1:
#endif
	.set	reorder
	/* Add the passed partial csum.	 */
	ADDC32(sum, a2)
	jr	ra
	.set	noreorder
	END(csum_partial)


/*
 * checksum and copy routines based on memcpy.S
 *
 *	csum_partial_copy_nocheck(src, dst, len, sum)
 *	__csum_partial_copy_kernel(src, dst, len, sum, errp)
 *
 * See "Spec" in memcpy.S for details.	Unlike __copy_user, all
 * functions in this file use the standard calling convention.
 */
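/*
 * For reference, the C-level prototypes are believed to be along the
 * lines of (errp receives -EFAULT on a faulting access):
 *
 *	__wsum csum_partial_copy_nocheck(const void *src, void *dst,
 *					 int len, __wsum sum);
 *	__wsum __csum_partial_copy_kernel(const void *src, void *dst,
 *					  int len, __wsum sum, int *errp);
 */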

#define src a0
#define dst a1
#define len a2
#define psum a3
#define sum v0
#define odd t8
#define errptr t9

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by __csum_partial_copy_from_user and maintained by
 *	not writing AT in __csum_partial_copy
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores store -EFAULT to errptr and return.
 * These handlers do not need to overwrite any data.
 */

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
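
/*
 * EXC() emits the access at local label 9 and records a (9b, handler)
 * pair in the __ex_table section.  If the access faults, the kernel's
 * exception fixup code looks the faulting address up in that table and
 * resumes at the named handler instead of treating the fault as fatal.
 */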

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#define SHIFT_DISCARD_REVERT SRLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#define SHIFT_DISCARD_REVERT SLLV
#endif
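
/*
 * LDFIRST/LDREST (and STFIRST/STREST) pair the unaligned load/store
 * instructions (lwl/lwr, ldl/ldr, swl/swr, sdl/sdr) so a full word can
 * be read from or written to an unaligned address in two accesses.
 * Which instruction covers which end of the word depends on the
 * endianness, hence the swapped definitions above.
 */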

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)

#define ADDRMASK (NBYTES-1)

#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	.set	noat
#else
	.set	at=v1
#endif

LEAF(__csum_partial_copy_kernel)
FEXPORT(__csum_partial_copy_to_user)
FEXPORT(__csum_partial_copy_from_user)
	PTR_ADDU	AT, src, len	/* See (1) above. */
#ifdef CONFIG_64BIT
	move	errptr, a4
#else
	lw	errptr, 16(sp)
#endif
FEXPORT(csum_partial_copy_nocheck)
	move	sum, zero
	move	odd, zero
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	bnez	t2, .Lcopy_bytes_checklen
	 and	t0, src, ADDRMASK
	andi	odd, dst, 0x1			/* odd buffer? */
	bnez	t1, .Ldst_unaligned
	 nop
	bnez	t0, .Lsrc_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
.Lboth_aligned:
	 SRL	t0, len, LOG_NBYTES+3	 # +3 for 8 units/iter
	beqz	t0, .Lcleanup_both_aligned # len < 8*NBYTES
	 nop
	SUB	len, 8*NBYTES		# subtract here for bgez loop
	.align	4
1:
EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
EXC(	LOAD	t4, UNIT(4)(src),	.Ll_exc_copy)
EXC(	LOAD	t5, UNIT(5)(src),	.Ll_exc_copy)
EXC(	LOAD	t6, UNIT(6)(src),	.Ll_exc_copy)
EXC(	LOAD	t7, UNIT(7)(src),	.Ll_exc_copy)
	SUB	len, len, 8*NBYTES
	ADD	src, src, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc)
	ADDC(sum, t0)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc)
	ADDC(sum, t1)
EXC(	STORE	t2, UNIT(2)(dst),	.Ls_exc)
	ADDC(sum, t2)
EXC(	STORE	t3, UNIT(3)(dst),	.Ls_exc)
	ADDC(sum, t3)
EXC(	STORE	t4, UNIT(4)(dst),	.Ls_exc)
	ADDC(sum, t4)
EXC(	STORE	t5, UNIT(5)(dst),	.Ls_exc)
	ADDC(sum, t5)
EXC(	STORE	t6, UNIT(6)(dst),	.Ls_exc)
	ADDC(sum, t6)
EXC(	STORE	t7, UNIT(7)(dst),	.Ls_exc)
	ADDC(sum, t7)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 8*NBYTES
	bgez	len, 1b
	.set	noreorder
	ADD	len, 8*NBYTES		# revert len (see above)

	/*
	 * len == the number of bytes left to copy < 8*NBYTES
	 */
.Lcleanup_both_aligned:
#define rem t7
	beqz	len, .Ldone
	 sltu	t0, len, 4*NBYTES
	bnez	t0, .Lless_than_4units
	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc)
	ADDC(sum, t0)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc)
	ADDC(sum, t1)
EXC(	STORE	t2, UNIT(2)(dst),	.Ls_exc)
	ADDC(sum, t2)
EXC(	STORE	t3, UNIT(3)(dst),	.Ls_exc)
	ADDC(sum, t3)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	beqz	len, .Ldone
	.set	noreorder
.Lless_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, .Lcopy_bytes
	 nop
1:
EXC(	LOAD	t0, 0(src),		.Ll_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		.Ls_exc)
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	rem, len, 1b
	.set	noreorder

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.	 Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
#define bits t2
	beqz	len, .Ldone
	 ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
EXC(	LOAD	t0, 0(src),		.Ll_exc)
	SUB	bits, bits, rem # bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
EXC(	STREST	t0, -1(t1),		.Ls_exc)
	SHIFT_DISCARD_REVERT t0, t0, bits
	.set reorder
	ADDC(sum, t0)
	b	.Ldone
	.set noreorder
.Ldst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; T1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
EXC(	LDFIRST t3, FIRST(0)(src),	.Ll_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src),	.Ll_exc_copy)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
EXC(	STFIRST t3, FIRST(0)(dst),	.Ls_exc)
	SLL	t4, t1, 3		# t4 = number of bits to discard
	SHIFT_DISCARD t3, t3, t4
	/* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
	ADDC(sum, t3)
	beq	len, t2, .Ldone
	 SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, .Lboth_aligned
	 ADD	src, src, t2

.Lsrc_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2	 # +2 for 4 units/iter
	beqz	t0, .Lcleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)	 # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST t0, FIRST(0)(src),	.Ll_exc)
EXC(	LDFIRST t1, FIRST(1)(src),	.Ll_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	.Ll_exc_copy)
EXC(	LDFIRST t2, FIRST(2)(src),	.Ll_exc_copy)
EXC(	LDFIRST t3, FIRST(3)(src),	.Ll_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	.Ll_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	.Ll_exc_copy)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc)
	ADDC(sum, t0)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc)
	ADDC(sum, t1)
EXC(	STORE	t2, UNIT(2)(dst),	.Ls_exc)
	ADDC(sum, t2)
EXC(	STORE	t3, UNIT(3)(dst),	.Ls_exc)
	ADDC(sum, t3)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcleanup_src_unaligned:
	beqz	len, .Ldone
	 and	rem, len, NBYTES-1  # rem = len % NBYTES
	beq	rem, len, .Lcopy_bytes
	 nop
1:
EXC(	LDFIRST t0, FIRST(0)(src),	.Ll_exc)
EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		.Ls_exc)
	ADDC(sum, t0)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcopy_bytes_checklen:
	beqz	len, .Ldone
	 nop
.Lcopy_bytes:
	/* 0 < len < NBYTES  */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define SHIFT_START 0
#define SHIFT_INC 8
#else
#define SHIFT_START 8*(NBYTES-1)
#define SHIFT_INC -8
#endif
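	/*
	 * The trailing 1..NBYTES-1 bytes are shifted into the byte lane
	 * they would occupy in a full word (SHIFT_START/SHIFT_INC depend
	 * on endianness), collected in t2, and added to the checksum
	 * once at .Lcopy_bytes_done.
	 */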
	move	t2, zero	# partial word
	li	t3, SHIFT_START # shift
/* use .Ll_exc_copy here to return correct sum on fault */
#define COPY_BYTE(N)			\
EXC(	lbu	t0, N(src), .Ll_exc_copy);	\
	SUB	len, len, 1;		\
EXC(	sb	t0, N(dst), .Ls_exc);	\
	SLLV	t0, t0, t3;		\
	addu	t3, SHIFT_INC;		\
	beqz	len, .Lcopy_bytes_done; \
	 or	t2, t0

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lbu	t0, NBYTES-2(src), .Ll_exc_copy)
	SUB	len, len, 1
EXC(	sb	t0, NBYTES-2(dst), .Ls_exc)
	SLLV	t0, t0, t3
	or	t2, t0
.Lcopy_bytes_done:
	ADDC(sum, t2)
.Ldone:
	/* fold checksum */
#ifdef USE_DOUBLE
	dsll32	v1, sum, 0
	daddu	sum, v1
	sltu	v1, sum, v1
	dsra32	sum, sum, 0
	addu	sum, v1
#endif

#ifdef CONFIG_CPU_MIPSR2
	wsbh	v1, sum
	movn	sum, v1, odd
#else
	beqz	odd, 1f			/* odd buffer alignment? */
	 lui	v1, 0x00ff
	addu	v1, 0x00ff
	and	t0, sum, v1
	sll	t0, t0, 8
	srl	sum, sum, 8
	and	sum, sum, v1
	or	sum, sum, t0
1:
#endif
	.set reorder
	ADDC32(sum, psum)
	jr	ra
	.set noreorder

.Ll_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	 li	t2, SHIFT_START
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lbu	t1, 0(src),	.Ll_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	SLLV	t1, t1, t2
	addu	t2, SHIFT_INC
	ADDC(sum, t1)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 1
	bne	src, t0, 1b
	.set	noreorder
.Ll_exc:
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	 nop
	SUB	len, AT, t0		# len number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 *   See (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address in a1
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
	.set	reorder				/* DADDI_WAR */
	SUB	src, len, 1
	beqz	len, .Ldone
	.set	noreorder
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
	.set	push
	.set	noat
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	bnez	src, 1b
	 SUB	src, src, 1
#else
	li	v1, 1
	bnez	src, 1b
	 SUB	src, src, v1
#endif
	li	v1, -EFAULT
	b	.Ldone
	 sw	v1, (errptr)

.Ls_exc:
	li	v0, -1 /* invalid checksum */
	li	v1, -EFAULT
	jr	ra
	 sw	v1, (errptr)
	.set	pop
	END(__csum_partial_copy_kernel)