/* NGmemcpy.S: Niagara optimized memcpy.
 *
 * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <asm/asi.h>
#include <asm/thread_info.h>
#define GLOBAL_SPARE	%g7
#define RESTORE_ASI(TMP)	\
	ldub	[%g6 + TI_CURRENT_DS], TMP;  \
	wr	TMP, 0x0, %asi;
#else
#define GLOBAL_SPARE	%g5
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_PNF, %asi
#endif

#ifdef __sparc_v9__
#define SAVE_AMOUNT	128
#else
#define SAVE_AMOUNT	64
#endif

#ifndef STORE_ASI
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif
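
/* Note: EX_LD/EX_ST/EX_RETVAL above default to the bare operation.
 * A wrapper that includes this file (the user-copy variants, for
 * instance) can redefine them to attach exception table entries to
 * each load and store.
 */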

#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest)	type [addr], dest
#else
#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
#endif
#endif

#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1)	\
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif
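
/* Note: ldda with the quad-LDD ASI fills an aligned even/odd
 * register pair in one access; the dest1 argument of LOAD_TWIN only
 * documents the implied odd register.
 */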

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_INIT
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_INIT(src,addr)	stxa src, [addr] %asi
#else
#define STORE_INIT(src,addr)	stx src, [addr + 0x00]
#endif
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NGmemcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %i0=dst, %i1=src, %i2=len */
	PREAMBLE
	save		%sp, -SAVE_AMOUNT, %sp
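	/* Sanity trap: %g2 is non-zero iff bit 31 or above of the
	 * length is set (a "negative" 32-bit value), in which case
	 * tne raises software trap 5.
	 */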
	srlx		%i2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	mov		%i0, %o0
	cmp		%i2, 0
	be,pn		%XCC, 85f
	 or		%o0, %i1, %i3
	cmp		%i2, 16
	blu,a,pn	%XCC, 80f
	 or		%i3, %i2, %i3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 */
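	/* Worked out: after losing the worst-case 63 bytes to
	 * alignment we still need 64 for one loop pass, i.e.
	 * len >= 63 + 64 = 127, which the test below rounds up to
	 * the even 2 * 64 = 128.
	 */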
	cmp		%i2, (2 * 64)
	blu,pt		%XCC, 70f
	 andcc		%i3, 0x7, %g0

	/* %o0:	dst
	 * %i1:	src
	 * %i2:	len  (known to be >= 128)
	 *
	 * The block copy loops will use %i4/%i5,%g2/%g3 as
	 * temporaries while copying the data.
	 */

	LOAD(prefetch, %i1, #one_read)
	wr		%g0, STORE_ASI, %asi

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %i4
	be,pt		%XCC, 2f
	 sub		%i4, 64, %i4
	sub		%g0, %i4, %i4	! bytes to align dst
	sub		%i2, %i4, %i2
1:	subcc		%i4, 1, %i4
	EX_LD(LOAD(ldub, %i1, %g1))
	EX_ST(STORE(stb, %g1, %o0))
	add		%i1, 1, %i1
	bne,pt		%XCC, 1b
	 add		%o0, 1, %o0

	/* If the source is on a 16-byte boundary we can do
	 * the direct block copy loop.  If it is 8-byte aligned
	 * we can do the 16-byte loads offset by -8 bytes and the
	 * init stores offset by one register.
	 *
	 * If the source is not even 8-byte aligned, we need to do
	 * shifting and masking (basically integer faligndata).
	 *
	 * The careful bit with init stores is that if we store
	 * to any part of the cache line we have to store the whole
	 * cacheline else we can end up with corrupt L2 cache line
	 * contents.  Since the loop works on 64-bytes of 64-byte
	 * aligned store data at a time, this is easy to ensure.
	 */
2:
	andcc		%i1, (16 - 1), %i4
	andn		%i2, (64 - 1), %g1	! block copy loop iterator
	be,pt		%XCC, 50f
	 sub		%i2, %g1, %i2		! final sub-block copy bytes

	cmp		%i4, 8
	be,pt		%XCC, 10f
	 sub		%i1, %i4, %i1

	/* Neither 8-byte nor 16-byte aligned, shift and mask.  */
	and		%i4, 0x7, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 3, GLOBAL_SPARE
	mov		64, %i5
	EX_LD(LOAD_TWIN(%i1, %g2, %g3))
	sub		%i5, GLOBAL_SPARE, %i5
	mov		16, %o4
	mov		32, %o5
	mov		48, %o7
	mov		64, %i3

	bg,pn	   	%XCC, 9f
	 nop

#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
	sllx		WORD1, POST_SHIFT, WORD1; \
	srlx		WORD2, PRE_SHIFT, TMP; \
	sllx		WORD2, POST_SHIFT, WORD2; \
	or		WORD1, TMP, WORD1; \
	srlx		WORD3, PRE_SHIFT, TMP; \
	or		WORD2, TMP, WORD2;
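
	/* Roughly, in C, with the source misaligned by `off` bytes
	 * (POST_SHIFT = off * 8, PRE_SHIFT = 64 - off * 8, and
	 * big-endian byte order):
	 *
	 *	w1 = (w1 << POST_SHIFT) | (w2 >> PRE_SHIFT);
	 *	w2 = (w2 << POST_SHIFT) | (w3 >> PRE_SHIFT);
	 *
	 * Each output dword takes the tail bytes of one aligned input
	 * dword and the head bytes of the next -- the integer
	 * faligndata mentioned above.
	 */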

8:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3))
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g2, %o0 + 0x00))
	EX_ST(STORE_INIT(%g3, %o0 + 0x08))

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3))
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x10))
	EX_ST(STORE_INIT(%o3, %o0 + 0x18))

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g2, %o0 + 0x20))
	EX_ST(STORE_INIT(%g3, %o0 + 0x28))

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3))
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x30))
	EX_ST(STORE_INIT(%o3, %o0 + 0x38))

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 8b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1

9:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3))
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g3, %o0 + 0x00))
	EX_ST(STORE_INIT(%o2, %o0 + 0x08))

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3))
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x10))
	EX_ST(STORE_INIT(%g2, %o0 + 0x18))

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g3, %o0 + 0x20))
	EX_ST(STORE_INIT(%o2, %o0 + 0x28))

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3))
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x30))
	EX_ST(STORE_INIT(%g2, %o0 + 0x38))

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 9b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1

10:	/* Destination is 64-byte aligned, source was only 8-byte
	 * aligned but it has been subtracted by 8 and we perform
	 * one twin load ahead, then add 8 back into source when
	 * we finish the loop.
	 */
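	/* Sketch of the software pipeline: the twin load at the bottom
	 * of the loop (at %i1 + 64) is the look-ahead pair, and its
	 * second dword becomes the first store of the next pass, so the
	 * pairs are consumed offset by one register, with no shifting.
	 */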
	EX_LD(LOAD_TWIN(%i1, %o4, %o5))
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o5, %o0 + 0x00))	! initializes cache line
	EX_ST(STORE_INIT(%o2, %o0 + 0x08))
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5))
	EX_ST(STORE_INIT(%o3, %o0 + 0x10))
	EX_ST(STORE_INIT(%o4, %o0 + 0x18))
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3))
	EX_ST(STORE_INIT(%o5, %o0 + 0x20))
	EX_ST(STORE_INIT(%o2, %o0 + 0x28))
	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5))
	add		%i1, 64, %i1
	EX_ST(STORE_INIT(%o3, %o0 + 0x30))
	EX_ST(STORE_INIT(%o4, %o0 + 0x38))
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, 0x8, %i1

50:	/* Destination is 64-byte aligned, and source is 16-byte
	 * aligned.
	 */
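	/* Each pass below moves one 64-byte line: four 16-byte twin
	 * loads, eight 8-byte init stores, and a prefetch one line
	 * ahead of the loads.
	 */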
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5))
	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o4, %o0 + 0x00))	! initializes cache line
	EX_ST(STORE_INIT(%o5, %o0 + 0x08))
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5))
	EX_ST(STORE_INIT(%o2, %o0 + 0x10))
	EX_ST(STORE_INIT(%o3, %o0 + 0x18))
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3))
	add	%i1, 64, %i1
	EX_ST(STORE_INIT(%o4, %o0 + 0x20))
	EX_ST(STORE_INIT(%o5, %o0 + 0x28))
	EX_ST(STORE_INIT(%o2, %o0 + 0x30))
	EX_ST(STORE_INIT(%o3, %o0 + 0x38))
	subcc	%g1, 64, %g1
	bne,pt	%XCC, 1b
	 add	%o0, 64, %o0
	/* fall through */

60:
	membar		#Sync

	/* %i2 contains any final bytes still needed to be copied
	 * over. If anything is left, we copy it one byte at a time.
	 */
	RESTORE_ASI(%i3)
	brz,pt		%i2, 85f
	 sub		%o0, %i1, %i3
	ba,a,pt		%XCC, 90f

	.align		64
70: /* 16 <= len < 128 */
	bne,pn		%XCC, 75f
	 sub		%o0, %i1, %i3

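	/* %i3 holds (dst - src) from here on, so only %i1 needs to
	 * advance: every store targets [%i1 + %i3].
	 */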
72:
	andn		%i2, 0xf, %i4
	and		%i2, 0xf, %i2
1:	subcc		%i4, 0x10, %i4
	EX_LD(LOAD(ldx, %i1, %o4))
	add		%i1, 0x08, %i1
	EX_LD(LOAD(ldx, %i1, %g1))
	sub		%i1, 0x08, %i1
	EX_ST(STORE(stx, %o4, %i1 + %i3))
	add		%i1, 0x8, %i1
	EX_ST(STORE(stx, %g1, %i1 + %i3))
	bgu,pt		%XCC, 1b
	 add		%i1, 0x8, %i1
73:	andcc		%i2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x8, %i2
	EX_LD(LOAD(ldx, %i1, %o4))
	EX_ST(STORE(stx, %o4, %i1 + %i3))
	add		%i1, 0x8, %i1
1:	andcc		%i2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x4, %i2
	EX_LD(LOAD(lduw, %i1, %i5))
	EX_ST(STORE(stw, %i5, %i1 + %i3))
	add		%i1, 0x4, %i1
1:	cmp		%i2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%i2, %g1, %i2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %i1, %i5))
	EX_ST(STORE(stb, %i5, %i1 + %i3))
	bgu,pt		%icc, 1b
	 add		%i1, 1, %i1

2:	add		%i1, %i3, %o0
	andcc		%i1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%i2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

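	/* Source and destination disagree on 8-byte alignment: stream
	 * aligned dwords and shift/or adjacent pairs together -- the
	 * single-stream flavor of MIX_THREE_WORDS above.  With
	 * %g1 = (src & 7) * 8, each stored dword is roughly:
	 *
	 *	out = (prev << %g1) | (next >> (64 - %g1));
	 */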
8:	mov		64, %i3
	andn		%i1, 0x7, %i1
	EX_LD(LOAD(ldx, %i1, %g2))
	sub		%i3, %g1, %i3
	andn		%i2, 0x7, %i4
	sllx		%g2, %g1, %g2
1:	add		%i1, 0x8, %i1
	EX_LD(LOAD(ldx, %i1, %g3))
	subcc		%i4, 0x8, %i4
	srlx		%g3, %i3, %i5
	or		%i5, %g2, %i5
	EX_ST(STORE(stx, %i5, %o0))
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%i2, 0x7, %i2
	be,pn		%icc, 85f
	 add		%i1, %g1, %i1
	ba,pt		%xcc, 90f
	 sub		%o0, %i1, %i3

	.align		64
80: /* 0 < len < 16 */
	andcc		%i3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %i1, %i3

1:
	subcc		%i2, 4, %i2
	EX_LD(LOAD(lduw, %i1, %g1))
	EX_ST(STORE(stw, %g1, %i1 + %i3))
	bgu,pt		%XCC, 1b
	 add		%i1, 4, %i1

85:	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	.align		32
90:
	subcc		%i2, 1, %i2
	EX_LD(LOAD(ldub, %i1, %g1))
	EX_ST(STORE(stb, %g1, %i1 + %i3))
	bgu,pt		%XCC, 90b
	 add		%i1, 1, %i1
	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	.size		FUNC_NAME, .-FUNC_NAME