/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 * Copyright (C) 2007  Maciej W. Rozycki
 * Copyright (C) 2014 Imagination Technologies Ltd.
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond the
 * end of memory.  It's also a seriously bad idea on non-DMA-coherent
 * systems.
 */
#ifdef CONFIG_DMA_NONCOHERENT
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
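
/*
 * Illustrative call sequence (a sketch, not taken from any real caller;
 * <dst>, <src> and <len> are placeholders):
 *
 *	move	a0, <dst>		# writable destination
 *	move	a1, <src>		# readable source
 *	li	a2, <len>		# byte count
 *	jal	memcpy
 *	 nop
 *	# v0 now holds the original dst, per the spec above
 */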

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
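
/*
 * Worked example of the above (a sketch with made-up addresses): enter
 * __copy_user with src_entry == 0x1000 and len == 0x100, so uaccess.h
 * sets AT == 0x1100 per (1).  If a load then faults at src == 0x10c0,
 * the handler can report len = AT - 0x10c0 = 0x40 uncopied bytes, and
 * (3) lets it locate the matching destination byte as
 * dst + (0x10c0 - src).
 */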

/* Instruction type */
#define LD_INSN 1
#define ST_INSN 2

/*
 * Wrapper to add an entry in the exception table
 * in case the insn causes a memory exception.
 * Arguments:
 * insn    : Load/store instruction
 * type    : Instruction type
 * reg     : Register
 * addr    : Address
 * handler : Exception handler
 */
#define EXC(insn, type, reg, addr, handler)	\
9:	insn reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
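
/*
 * For illustration, EXC(lw, LD_INSN, t0, 0(src), .Ll_exc) expands to
 * roughly (a sketch; the assembler sees it as one logical line):
 *
 * 9:	lw	t0, 0(src)
 *	.section __ex_table,"a"
 *	PTR	9b, .Ll_exc
 *	.previous
 *
 * so a fault in the load at local label 9 is fixed up at .Ll_exc via
 * the kernel's exception table.
 */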

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOADK ld /* No exception */
#define LOAD(reg, addr, handler)	EXC(ld, LD_INSN, reg, addr, handler)
#define LOADL(reg, addr, handler)	EXC(ldl, LD_INSN, reg, addr, handler)
#define LOADR(reg, addr, handler)	EXC(ldr, LD_INSN, reg, addr, handler)
#define STOREL(reg, addr, handler)	EXC(sdl, ST_INSN, reg, addr, handler)
#define STORER(reg, addr, handler)	EXC(sdr, ST_INSN, reg, addr, handler)
#define STORE(reg, addr, handler)	EXC(sd, ST_INSN, reg, addr, handler)
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register definitions
 * from the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOADK lw /* No exception */
#define LOAD(reg, addr, handler)	EXC(lw, LD_INSN, reg, addr, handler)
#define LOADL(reg, addr, handler)	EXC(lwl, LD_INSN, reg, addr, handler)
#define LOADR(reg, addr, handler)	EXC(lwr, LD_INSN, reg, addr, handler)
#define STOREL(reg, addr, handler)	EXC(swl, ST_INSN, reg, addr, handler)
#define STORER(reg, addr, handler)	EXC(swr, ST_INSN, reg, addr, handler)
#define STORE(reg, addr, handler)	EXC(sw, ST_INSN, reg, addr, handler)
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */
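
/*
 * Worked example of the width abstraction (a sketch): with USE_DOUBLE,
 * "SRL t0, len, LOG_NBYTES+3" assembles to dsrl and computes len / 64
 * (8 doublewords per unrolled iteration); on a 32-bit kernel the same
 * line assembles to srl and computes len / 32 (8 words).
 */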

#define LOADB(reg, addr, handler)	EXC(lb, LD_INSN, reg, addr, handler)
#define STOREB(reg, addr, handler)	EXC(sb, ST_INSN, reg, addr, handler)

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
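
/*
 * For illustration (a sketch, NBYTES == 8): FIRST(0) == 0 and
 * REST(0) == 7, so the LDFIRST(t0, FIRST(0)(src), ...) and
 * LDREST(t0, REST(0)(src), ...) pair becomes ldr/ldl on little-endian
 * (ldl/ldr on big-endian) against the two ends of one unaligned
 * doubleword, merging its bytes into t0.
 */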

	.text
	.set	noreorder
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	.set	noat
#else
	.set	at=v1
#endif

/*
 * t6 is used as a flag to note inatomic mode.
 */
LEAF(__copy_user_inatomic)
	b	__copy_user_common
	 li	t6, 1
	END(__copy_user_inatomic)

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
	move	v0, dst				/* return value */
.L__memcpy:
FEXPORT(__copy_user)
	li	t6, 0	/* not inatomic */
__copy_user_common:
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
#define rem t8

	R10KCBARRIER(0(ra))
	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	PREF(	0, 0(src) )
	PREF(	1, 0(dst) )
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	PREF(	0, 1*32(src) )
	PREF(	1, 1*32(dst) )
	bnez	t2, .Lcopy_bytes_checklen
	 and	t0, src, ADDRMASK
	PREF(	0, 2*32(src) )
	PREF(	1, 2*32(dst) )
	bnez	t1, .Ldst_unaligned
	 nop
	bnez	t0, .Lsrc_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
.Lboth_aligned:
	 SRL	t0, len, LOG_NBYTES+3	 # +3 for 8 units/iter
	beqz	t0, .Lcleanup_both_aligned # len < 8*NBYTES
	 and	rem, len, (8*NBYTES-1)	 # rem = len % (8*NBYTES)
	PREF(	0, 3*32(src) )
	PREF(	1, 3*32(dst) )
	.align	4
1:
	R10KCBARRIER(0(ra))
	LOAD(t0, UNIT(0)(src), .Ll_exc)
	LOAD(t1, UNIT(1)(src), .Ll_exc_copy)
	LOAD(t2, UNIT(2)(src), .Ll_exc_copy)
	LOAD(t3, UNIT(3)(src), .Ll_exc_copy)
	SUB	len, len, 8*NBYTES
	LOAD(t4, UNIT(4)(src), .Ll_exc_copy)
	LOAD(t7, UNIT(5)(src), .Ll_exc_copy)
	STORE(t0, UNIT(0)(dst),	.Ls_exc_p8u)
	STORE(t1, UNIT(1)(dst),	.Ls_exc_p7u)
	LOAD(t0, UNIT(6)(src), .Ll_exc_copy)
	LOAD(t1, UNIT(7)(src), .Ll_exc_copy)
	ADD	src, src, 8*NBYTES
	ADD	dst, dst, 8*NBYTES
	STORE(t2, UNIT(-6)(dst), .Ls_exc_p6u)
	STORE(t3, UNIT(-5)(dst), .Ls_exc_p5u)
	STORE(t4, UNIT(-4)(dst), .Ls_exc_p4u)
	STORE(t7, UNIT(-3)(dst), .Ls_exc_p3u)
	STORE(t0, UNIT(-2)(dst), .Ls_exc_p2u)
	STORE(t1, UNIT(-1)(dst), .Ls_exc_p1u)
	PREF(	0, 8*32(src) )
	PREF(	1, 8*32(dst) )
	bne	len, rem, 1b
	 nop

	/*
	 * len == rem == the number of bytes left to copy < 8*NBYTES
	 */
.Lcleanup_both_aligned:
	beqz	len, .Ldone
	 sltu	t0, len, 4*NBYTES
	bnez	t0, .Lless_than_4units
	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
	LOAD( t0, UNIT(0)(src),	.Ll_exc)
	LOAD( t1, UNIT(1)(src),	.Ll_exc_copy)
	LOAD( t2, UNIT(2)(src),	.Ll_exc_copy)
	LOAD( t3, UNIT(3)(src),	.Ll_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
	R10KCBARRIER(0(ra))
	STORE(t0, UNIT(0)(dst),	.Ls_exc_p4u)
	STORE(t1, UNIT(1)(dst),	.Ls_exc_p3u)
	STORE(t2, UNIT(2)(dst),	.Ls_exc_p2u)
	STORE(t3, UNIT(3)(dst),	.Ls_exc_p1u)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	beqz	len, .Ldone
	.set	noreorder
.Lless_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, .Lcopy_bytes
	 nop
1:
	R10KCBARRIER(0(ra))
	LOAD(t0, 0(src), .Ll_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE(t0, 0(dst), .Ls_exc_p1u)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	rem, len, 1b
	.set	noreorder

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.	 Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
#define bits t2
	beqz	len, .Ldone
	 ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
	LOAD(t0, 0(src), .Ll_exc)
	SUB	bits, bits, rem # bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
	STREST(t0, -1(t1), .Ls_exc)
	jr	ra
	 move	len, zero
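
	/*
	 * Worked example of the partial-word store above (a sketch:
	 * little endian, NBYTES == 8, len == 3): rem = 3*8 = 24 bits to
	 * keep, bits = 64 - 24 = 40 bits to discard, so SHIFT_DISCARD
	 * (dsllv here) moves the three wanted low bytes to the top of
	 * t0, and STREST (sdl here) at dst+2 writes exactly those three
	 * bytes to dst..dst+2 without ever reading dst.
	 */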
.Ldst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; t1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
	LDFIRST(t3, FIRST(0)(src), .Ll_exc)
	ADD	t2, zero, NBYTES
	LDREST(t3, REST(0)(src), .Ll_exc_copy)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
	R10KCBARRIER(0(ra))
	STFIRST(t3, FIRST(0)(dst), .Ls_exc)
	beq	len, t2, .Ldone
	 SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, .Lboth_aligned
	 ADD	src, src, t2
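
	/*
	 * Worked example of the alignment fix-up above (a sketch,
	 * NBYTES == 8, dst & ADDRMASK == 3): the LDFIRST/LDREST pair
	 * fetches one unaligned doubleword, STFIRST stores its first
	 * t2 = 8 - 3 = 5 bytes, and both pointers advance by 5, leaving
	 * dst doubleword aligned.  match == 0 iff src and dst had equal
	 * misalignment, so src is then aligned too and we rejoin
	 * .Lboth_aligned.
	 */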

.Lsrc_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2	 # +2 for 4 units/iter
	PREF(	0, 3*32(src) )
	beqz	t0, .Lcleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)	 # rem = len % 4*NBYTES
	PREF(	1, 3*32(dst) )
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
	R10KCBARRIER(0(ra))
	LDFIRST(t0, FIRST(0)(src), .Ll_exc)
	LDFIRST(t1, FIRST(1)(src), .Ll_exc_copy)
	SUB	len, len, 4*NBYTES
	LDREST(t0, REST(0)(src), .Ll_exc_copy)
	LDREST(t1, REST(1)(src), .Ll_exc_copy)
	LDFIRST(t2, FIRST(2)(src), .Ll_exc_copy)
	LDFIRST(t3, FIRST(3)(src), .Ll_exc_copy)
	LDREST(t2, REST(2)(src), .Ll_exc_copy)
	LDREST(t3, REST(3)(src), .Ll_exc_copy)
	PREF(	0, 9*32(src) )		# 0 is PREF_LOAD  (not streamed)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
	STORE(t0, UNIT(0)(dst),	.Ls_exc_p4u)
	STORE(t1, UNIT(1)(dst),	.Ls_exc_p3u)
	STORE(t2, UNIT(2)(dst),	.Ls_exc_p2u)
	STORE(t3, UNIT(3)(dst),	.Ls_exc_p1u)
	PREF(	1, 9*32(dst) )		# 1 is PREF_STORE (not streamed)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcleanup_src_unaligned:
	beqz	len, .Ldone
	 and	rem, len, NBYTES-1  # rem = len % NBYTES
	beq	rem, len, .Lcopy_bytes
	 nop
1:
	R10KCBARRIER(0(ra))
	LDFIRST(t0, FIRST(0)(src), .Ll_exc)
	LDREST(t0, REST(0)(src), .Ll_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE(t0, 0(dst), .Ls_exc_p1u)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcopy_bytes_checklen:
	beqz	len, .Ldone
	 nop
.Lcopy_bytes:
	/* 0 < len < NBYTES  */
	R10KCBARRIER(0(ra))
#define COPY_BYTE(N)			\
	LOADB(t0, N(src), .Ll_exc);	\
	SUB	len, len, 1;		\
	beqz	len, .Ldone;		\
	STOREB(t0, N(dst), .Ls_exc_p1)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
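	/*
	 * 0 < len < NBYTES on entry, and each COPY_BYTE above exits
	 * through .Ldone once len reaches zero, so falling out of the
	 * NBYTES-2 expansions leaves exactly one byte, at offset
	 * NBYTES-2; it is copied below with the store in the jr delay
	 * slot.
	 */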
	LOADB(t0, NBYTES-2(src), .Ll_exc)
	SUB	len, len, 1
	jr	ra
	STOREB(t0, NBYTES-2(dst), .Ls_exc_p1)
.Ldone:
	jr	ra
	 nop
	END(memcpy)

.Ll_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOADK	t0, TI_TASK($28)
	 nop
	LOADK	t0, THREAD_BUADDR(t0)
1:
	LOADB(t1, 0(src), .Ll_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 1
	bne	src, t0, 1b
	.set	noreorder
.Ll_exc:
	LOADK	t0, TI_TASK($28)
	 nop
	LOADK	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	 nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	bnez	t6, .Ldone	/* Skip the zeroing part if inatomic */
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 *   See (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
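	/*
	 * For example (a sketch): if the bad address t0 lies 0x20 bytes
	 * past the current src, the bytes [src, t0) have already been
	 * copied, so the two instructions below advance dst by 0x20 to
	 * the first destination byte that never received data; the loop
	 * further below then zeroes len bytes from there.
	 */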
	ADD	dst, t0			# compute start address in a0
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
	.set	reorder				/* DADDI_WAR */
	SUB	src, len, 1
	beqz	len, .Ldone
	.set	noreorder
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	bnez	src, 1b
	 SUB	src, src, 1
#else
	.set	push
	.set	noat
	li	v1, 1
	bnez	src, 1b
	 SUB	src, src, v1
	.set	pop
#endif
	jr	ra
	 nop


#define SEXC(n)							\
	.set	reorder;			/* DADDI_WAR */ \
.Ls_exc_p ## n ## u:						\
	ADD	len, len, n*NBYTES;				\
	jr	ra;						\
	.set	noreorder

SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

.Ls_exc_p1:
	.set	reorder				/* DADDI_WAR */
	ADD	len, len, 1
	jr	ra
	.set	noreorder
.Ls_exc:
	jr	ra
	 nop
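
/*
 * memmove falls back to memcpy when the regions don't overlap.  Worked
 * example of the test below (a sketch): dst = 0x100, src = 0x108 and
 * len = 0x10 give t0 = (src < dst + len) = 1 and t1 = (dst < src + len)
 * = 1, so the regions overlap and the byte-at-a-time __rmemcpy is used;
 * with len = 8 the first test fails and we branch to .L__memcpy.
 */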

	.align	5
LEAF(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0			# dst + len <= src -> memcpy
	sltu	t1, a0, t1			# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, .L__memcpy
	 move	v0, a0				/* return value */
	beqz	a2, .Lr_out
	END(memmove)

	/* fall through to __rmemcpy */
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	 sltu	t0, a1, a0
	beqz	t0, .Lr_end_bytes_up		# src >= dst
	 nop
	ADD	a0, a2				# dst = dst + len
	ADD	a1, a2				# src = src + len

.Lr_end_bytes:
	R10KCBARRIER(0(ra))
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	.set	reorder				/* DADDI_WAR */
	SUB	a0, a0, 0x1
	bnez	a2, .Lr_end_bytes
	.set	noreorder

.Lr_out:
	jr	ra
	 move	a2, zero

.Lr_end_bytes_up:
	R10KCBARRIER(0(ra))
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	.set	reorder				/* DADDI_WAR */
	ADD	a0, a0, 0x1
	bnez	a2, .Lr_end_bytes_up
	.set	noreorder

	jr	ra
	 move	a2, zero
	END(__rmemcpy)