nvc0_grhub.fuc 18.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
/* fuc microcode for nvc0 PGRAPH/HUB
 *
 * Copyright 2011 Red Hat Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Ben Skeggs
 */

/* To build:
 *    m4 nvc0_grhub.fuc | envyas -a -w -m fuc -V nva3 -o nvc0_grhub.fuc.h
 */

30
// Data segment of the HUB fuc.  The code section addresses everything
// here through D[] loads/stores (or xdld/xdst transfers) using the labels
// defined below.
.section #nvc0_grhub_data
31 32 33 34 35 36 37 38 39 40 41
// first inclusion of the shared file (presumably contributes its data
// half at this point - confirm in nvc0_graph.fuc)
include(`nvc0_graph.fuc')
gpc_count:		.b32 0	// low 5 bits of 0x409604, stored by init
rop_count:		.b32 0	// bits 16:20 of 0x409604, stored by init
// command queue: filled by the interrupt handler, drained by main
cmd_queue:		queue_init
hub_mmio_list_head:	.b32 0	// copied by init from the matching chipsets entry
hub_mmio_list_tail:	.b32 0	// copied by init from the matching chipsets entry

// MEM_BASE value of the currently loaded context, written by ctx_load
ctx_current:		.b32 0

// Chipset table searched by init.  Every entry is 8 bytes:
//   +0: 32-bit chipset id, compared against CC_SCRATCH[0]
//   +4: 16-bit data offset of the first mmio list entry
//   +6: 16-bit data offset of the end of the mmio list
// An entry with id 0 terminates the table.
chipsets:
.b8  0xc0 0 0 0
42 43
.b16 #nvc0_hub_mmio_head
.b16 #nvc0_hub_mmio_tail
44
.b8  0xc1 0 0 0
45 46
.b16 #nvc0_hub_mmio_head
.b16 #nvc1_hub_mmio_tail
47
.b8  0xc3 0 0 0
48 49
.b16 #nvc0_hub_mmio_head
.b16 #nvc0_hub_mmio_tail
50
.b8  0xc4 0 0 0
51 52
.b16 #nvc0_hub_mmio_head
.b16 #nvc0_hub_mmio_tail
53
.b8  0xc8 0 0 0
54 55
.b16 #nvc0_hub_mmio_head
.b16 #nvc0_hub_mmio_tail
56
.b8  0xce 0 0 0
57 58
.b16 #nvc0_hub_mmio_head
.b16 #nvc0_hub_mmio_tail
59
.b8  0xcf 0 0 0
60 61
.b16 #nvc0_hub_mmio_head
.b16 #nvc0_hub_mmio_tail
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
.b8  0 0 0 0

// HUB mmio context list.  Each entry gives a register address and a count
// (presumably the number of consecutive 32-bit registers starting there -
// confirm against the macro definition in nvc0_graph.fuc).
nvc0_hub_mmio_head:
mmctx_data(0x17e91c, 2)
mmctx_data(0x400204, 2)
mmctx_data(0x404004, 11)
mmctx_data(0x404044, 1)
mmctx_data(0x404094, 14)
mmctx_data(0x4040d0, 7)
mmctx_data(0x4040f8, 1)
mmctx_data(0x404130, 3)
mmctx_data(0x404150, 3)
mmctx_data(0x404164, 2)
mmctx_data(0x404174, 3)
mmctx_data(0x404200, 8)
mmctx_data(0x404404, 14)
mmctx_data(0x404460, 4)
mmctx_data(0x404480, 1)
mmctx_data(0x404498, 1)
mmctx_data(0x404604, 4)
mmctx_data(0x404618, 32)
mmctx_data(0x404698, 21)
mmctx_data(0x4046f0, 2)
mmctx_data(0x404700, 22)
mmctx_data(0x405800, 1)
mmctx_data(0x405830, 3)
mmctx_data(0x405854, 1)
mmctx_data(0x405870, 4)
mmctx_data(0x405a00, 2)
mmctx_data(0x405a18, 1)
mmctx_data(0x406020, 1)
mmctx_data(0x406028, 4)
mmctx_data(0x4064a8, 2)
mmctx_data(0x4064b4, 2)
mmctx_data(0x407804, 1)
mmctx_data(0x40780c, 6)
mmctx_data(0x4078bc, 1)
mmctx_data(0x408000, 7)
mmctx_data(0x408064, 1)
mmctx_data(0x408800, 3)
mmctx_data(0x408900, 4)
mmctx_data(0x408980, 1)
// the list ends here for every chipset in the table except 0xc1 ...
nvc0_hub_mmio_tail:
mmctx_data(0x4064c0, 2)
// ... which uses this tail, and so gets the one extra entry above
nvc1_hub_mmio_tail:

// Buffer that ctx_load fills with the first 256 bytes of a context, and
// that ctx_mmio_exec writes back after clearing the count.  Only the two
// words named here are read by this file.
.align 256
chan_data:
chan_mmio_count:	.b32 0	// number of entries in the per-channel mmio list
chan_mmio_address:	.b32 0	// MEM_BASE value used to fetch that list

// Scratch buffer for xdld transfers (16 bytes in ctx_load, 256 bytes in
// ctx_mmio_exec).  Only one word is declared; the transfers rely on this
// being the last object in the data section.
.align 256
xfer_data: 		.b32 0

116 117
// Code segment of the HUB fuc.  The first instruction jumps over the
// shared helper routines to init (presumably execution starts at the
// beginning of this section - confirm against the host loader).
.section #nvc0_grhub_code
bra #init
118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
// second inclusion of the shared file, this time with the macro on the
// next line defined (presumably selecting its code half - confirm in
// nvc0_graph.fuc)
define(`include_code')
include(`nvc0_graph.fuc')

// Raise an exception towards the host
//
// In:  $r15 error code (see nvc0_graph.fuc)
// Out: $r15 is overwritten (left holding 1), $r14 is preserved
//
error:
	push $r14

	// CC_SCRATCH[5] = error code
	mov $r14 0x814
	shl b32 $r14 6
	iowr I[$r14] $r15

	// INTR_UP_SET = 1
	mov $r15 1
	mov $r14 0xc1c
	shl b32 $r14 6
	iowr I[$r14] $r15

	pop $r14
	ret

// HUB fuc initialisation, executed by triggering ucode start, will
// fall through to main loop after completion.
//
// Input:
//   CC_SCRATCH[0]: chipset (PMC_BOOT_0 read returns 0x0bad0bad... sigh)
//
// Output:
//   CC_SCRATCH[0]:
//	     31:31: set to signal completion
//   CC_SCRATCH[1]:
//	      31:0: total PGRAPH context size
//
// Register use during the context size calculation:
//   $r1: running total of the context size, in bytes
//   $r2: chipset id read from CC_SCRATCH[0], also handed to each GPC
//   $r3: number of GPCs still to start
//   $r4: MMIO base of the GPC currently being started
//
init:
	// $r0 is zeroed here and used as a zero base/value by the rest
	// of this file
	clear b32 $r0
	mov $sp $r0
	mov $xdbase $r0

	// enable fifo access
	mov $r1 0x1200
	mov $r2 2
	iowr I[$r1 + 0x000] $r2	// FIFO_ENABLE

	// setup i0 handler, and route all interrupts to it
160
	mov $r1 #ih
161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
	mov $iv0 $r1
	mov $r1 0x400
	iowr I[$r1 + 0x300] $r0	// INTR_DISPATCH

	// route HUB_CHANNEL_SWITCH to fuc interrupt 8
	mov $r3 0x404
	shl b32 $r3 6
	mov $r2 0x2003		// { HUB_CHANNEL_SWITCH, ZERO } -> intr 8
	iowr I[$r3 + 0x000] $r2

	// not sure what these are, route them because NVIDIA does, and
	// the IRQ handler will signal the host if we ever get one.. we
	// may find out if/why we need to handle these if so..
	//
	mov $r2 0x2004
	iowr I[$r3 + 0x004] $r2 // { 0x04, ZERO } -> intr 9
	mov $r2 0x200b
	iowr I[$r3 + 0x008] $r2 // { 0x0b, ZERO } -> intr 10
	mov $r2 0x200c
	iowr I[$r3 + 0x01c] $r2 // { 0x0c, ZERO } -> intr 15

	// enable all INTR_UP interrupts
	mov $r2 0xc24
	shl b32 $r2 6
	not b32 $r3 $r0		// $r3 = 0xffffffff
	iowr I[$r2] $r3

	// enable fifo, ctxsw, 9, 10, 15 interrupts
	// (0x8704 = bits 2, 8, 9, 10 and 15; $r1 is still 0x400 here)
	mov $r2 -0x78fc		// 0x8704
	sethi $r2 0
	iowr I[$r1 + 0x000] $r2	// INTR_EN_SET

	// fifo level triggered, rest edge
	sub b32 $r1 0x100
	mov $r2 4
	iowr I[$r1] $r2

	// enable interrupts
	bset $flags ie0

	// fetch enabled GPC/ROP counts
	mov $r14 -0x69fc	// 0x409604
	sethi $r14 0x400000
204
	call #nv_rd32
205
	extr $r1 $r15 16:20
206
	st b32 D[$r0 + #rop_count] $r1
207
	and $r15 0x1f
208
	st b32 D[$r0 + #gpc_count] $r15
209 210 211 212 213 214 215 216 217 218 219 220 221 222

	// set BAR_REQMASK to GPC mask
	// ($r15 still holds the GPC count: mask = (1 << count) - 1)
	mov $r1 1
	shl b32 $r1 $r15
	sub b32 $r1 1
	mov $r2 0x40c
	shl b32 $r2 6
	iowr I[$r2 + 0x000] $r1
	iowr I[$r2 + 0x100] $r1

	// find context data for this chipset
	mov $r2 0x800
	shl b32 $r2 6
	iord $r2 I[$r2 + 0x000]		// CC_SCRATCH[0]
223
	// start one 8-byte entry before the table, the loop below
	// advances $r15 before it looks at an entry
	mov $r15 #chipsets - 8
224 225 226 227
	init_find_chipset:
		add b32 $r15 8
		ld b32 $r3 D[$r15 + 0x00]
		cmpu b32 $r3 $r2
228
		bra e #init_context
229
		// an id of zero marks the end of the table
		cmpu b32 $r3 0
230
		bra ne #init_find_chipset
231 232 233 234 235 236 237 238 239 240 241
		// unknown chipset
		// NOTE(review): $sp was set to 0 above and nothing is on
		// the stack at this point, so it is unclear where this ret
		// ends up - confirm the intended behaviour.  The completion
		// bit in CC_SCRATCH[0] is not set on this path.
		ret

	// context size calculation, reserve first 256 bytes for use by fuc
	init_context:
	mov $r1 256

	// calculate size of mmio context data
	// ($r15 points at the matching table entry: list head offset at
	// +4, list tail offset at +6)
	ld b16 $r14 D[$r15 + 4]
	ld b16 $r15 D[$r15 + 6]
	sethi $r14 0
242 243 244
	st b32 D[$r0 + #hub_mmio_list_head] $r14
	st b32 D[$r0 + #hub_mmio_list_tail] $r15
	// presumably returns the list's context size in $r15, which is
	// what gets added to $r1 below - confirm in nvc0_graph.fuc
	call #mmctx_size
245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262

	// set mmctx base addresses now so we don't have to do it later,
	// they don't (currently) ever change
	mov $r3 0x700
	shl b32 $r3 6
	shr b32 $r4 $r1 8		// base is programmed in 256-byte units
	iowr I[$r3 + 0x000] $r4		// MMCTX_SAVE_SWBASE
	iowr I[$r3 + 0x100] $r4		// MMCTX_LOAD_SWBASE
	add b32 $r3 0x1300
	add b32 $r1 $r15
	shr b32 $r15 2			// bytes -> 32-bit words
	iowr I[$r3 + 0x000] $r15	// MMCTX_LOAD_COUNT, wtf for?!?

	// strands, base offset needs to be aligned to 256 bytes
	// (this always advances to the next 256-byte boundary, even when
	// $r1 is already aligned)
	shr b32 $r1 8
	add b32 $r1 1
	shl b32 $r1 8
	mov b32 $r15 $r1
263
	// presumably takes the base offset in $r15 and returns the strand
	// context size in $r15 - confirm in nvc0_graph.fuc
	call #strand_ctx_init
264 265 266 267 268 269 270 271 272 273
	add b32 $r1 $r15

	// initialise each GPC in sequence by passing in the offset of its
	// context data in GPCn_CC_SCRATCH[1], and starting its FUC (which
	// has previously been uploaded by the host) running.
	//
	// the GPC fuc init sequence will set GPCn_CC_SCRATCH[0] bit 31
	// when it has completed, and return the size of its context data
	// in GPCn_CC_SCRATCH[1]
	//
	// NOTE(review): the loop below decrements before testing, so it
	// assumes the GPC count is non-zero - confirm that is guaranteed.
	//
274
	ld b32 $r3 D[$r0 + #gpc_count]
275 276 277 278 279 280
	mov $r4 0x2000
	sethi $r4 0x500000		// $r4 = 0x502000, first GPC
	init_gpc:
		// setup, and start GPC ucode running
		add b32 $r14 $r4 0x804
		mov b32 $r15 $r1
281
		call #nv_wr32			// CC_SCRATCH[1] = ctx offset
282 283
		add b32 $r14 $r4 0x800
		mov b32 $r15 $r2
284
		call #nv_wr32			// CC_SCRATCH[0] = chipset
285 286
		add b32 $r14 $r4 0x10c
		clear b32 $r15
287
		call #nv_wr32
288
		// $r15 is not reloaded, so this write relies on it still
		// being zero after the previous call
		add b32 $r14 $r4 0x104
289
		call #nv_wr32			// ENTRY
290 291
		add b32 $r14 $r4 0x100
		mov $r15 2			// CTRL_START_TRIGGER
292
		call #nv_wr32			// CTRL
293 294 295 296

		// wait for it to complete, and adjust context size
		add b32 $r14 $r4 0x800
		init_gpc_wait:
297
			call #nv_rd32
298
			xbit $r15 $r15 31
299
			bra e #init_gpc_wait
300
		add b32 $r14 $r4 0x804
301
		call #nv_rd32
302 303 304 305 306
		add b32 $r1 $r15

		// next!
		add b32 $r4 0x8000
		sub b32 $r3 1
307
		bra ne #init_gpc
308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324

	// save context size, and tell host we're ready
	mov $r2 0x800
	shl b32 $r2 6
	iowr I[$r2 + 0x100] $r1		// CC_SCRATCH[1]  = context size
	add b32 $r2 0x800
	clear b32 $r1
	bset $r1 31
	iowr I[$r2 + 0x000] $r1		// CC_SCRATCH[0] |= 0x80000000

// Main program loop, very simple, sleeps until woken up by the interrupt
// handler, pulls a command from the queue and executes its handler
//
// Commands recognised here (command in $r14, data in $r15 after the
// queue fetch):
//   0x4001: context switch, queued by the interrupt handler
//   0x0001: set current channel, data = channel address
//   0x0002: save the current channel context
// Anything else is reported to the host through error().
//
main:
	// sleep until we have something to do
	bset $flags $p0
	sleep $p0
325 326 327
	mov $r13 #cmd_queue
	call #queue_get
	// presumably $p1 is set when nothing was fetched - confirm in
	// nvc0_graph.fuc
	bra $p1 #main
328 329 330

	// context switch, requested by GPU?
	cmpu b32 $r14 0x4001
331
	bra ne #main_not_ctx_switch
332 333 334 335 336 337 338
		trace_set(T_AUTO)
		mov $r1 0xb00
		shl b32 $r1 6
		iord $r2 I[$r1 + 0x100]		// CHAN_NEXT
		iord $r1 I[$r1 + 0x000]		// CHAN_CUR

		// bit 31 of each value decides whether there is a
		// previous/next channel to save/load
		xbit $r3 $r1 31
339
		bra e #chsw_no_prev
340
			xbit $r3 $r2 31
341
			bra e #chsw_prev_no_next
342 343 344 345 346
				// both present: save the current channel,
				// then load the next one ($r2 is kept on
				// the stack across the save)
				push $r2
				mov b32 $r2 $r1
				trace_set(T_SAVE)
				bclr $flags $p1
				bset $flags $p2
347
				call #ctx_xfer
348 349 350 351
				trace_clr(T_SAVE);
				pop $r2
				trace_set(T_LOAD);
				bset $flags $p1
352
				call #ctx_xfer
353
				trace_clr(T_LOAD);
354
				bra #chsw_done
355 356 357 358 359
			chsw_prev_no_next:
				// only a previous channel: save it, then
				// write the CHAN_NEXT value into CHAN_CUR
				push $r2
				mov b32 $r2 $r1
				bclr $flags $p1
				bclr $flags $p2
360
				call #ctx_xfer
361 362 363 364
				pop $r2
				mov $r1 0xb00
				shl b32 $r1 6
				iowr I[$r1] $r2
365
				bra #chsw_done
366 367
		chsw_no_prev:
			xbit $r3 $r2 31
368
			bra e #chsw_done
369 370
				// only a next channel: plain load
				bset $flags $p1
				bclr $flags $p2
371
				call #ctx_xfer
372 373 374 375 376 377 378 379

		// ack the context switch request
		chsw_done:
		mov $r1 0xb0c
		shl b32 $r1 6
		mov $r2 1
		iowr I[$r1 + 0x000] $r2		// 0x409b0c
		trace_clr(T_AUTO)
380
		bra #main
381 382 383 384

	// request to set current channel? (*not* a context switch)
	main_not_ctx_switch:
	cmpu b32 $r14 0x0001
385
	bra ne #main_not_ctx_chan
386
		mov b32 $r2 $r15
387 388
		call #ctx_chan
		bra #main_done
389 390 391 392

	// request to store current channel context?
	main_not_ctx_chan:
	cmpu b32 $r14 0x0002
393
	bra ne #main_not_ctx_save
394 395 396
		trace_set(T_SAVE)
		bclr $flags $p1
		bclr $flags $p2
397
		call #ctx_xfer
398
		trace_clr(T_SAVE)
399
		bra #main_done
400 401 402 403

	// unknown command: error code = (command << 16) | bad-command code.
	// Returns straight to the top of the loop, so the completion bit
	// below is not set for unknown commands.
	main_not_ctx_save:
		shl b32 $r15 $r14 16
		or $r15 E_BAD_COMMAND
404 405
		call #error
		bra #main
406 407 408 409 410 411 412

	// signal completion of a host-issued command
	main_done:
	mov $r1 0x820
	shl b32 $r1 6
	clear b32 $r2
	bset $r2 31
	iowr I[$r1 + 0x000] $r2		// CC_SCRATCH[0] |= 0x80000000
413
	bra #main
414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429

// interrupt handler
//
// Installed on fuc interrupt vector 0 by init.  Queues incoming fifo
// commands and context switch requests on cmd_queue for main(), and
// raises INTR_UP for every other pending interrupt so the host gets to
// see it.
//
// $flags and every register touched here are saved on entry and restored
// on exit; the only change visible to the interrupted code is that $p0
// has been cleared, which is what ends the sleep in main().
//
// $r10 holds the INTR value read on entry and must stay intact until it
// is written to INTR_ACK at the end.
//
ih:
	push $r8
	mov $r8 $flags
	push $r8
	push $r9
	push $r10
	push $r11
	push $r13
	push $r14
	push $r15

	// incoming fifo command?
	iord $r10 I[$r0 + 0x200]	// INTR
	and $r11 $r10 0x00000004
	bra e #ih_no_fifo
		// queue incoming fifo command for later processing
		mov $r11 0x1900
		mov $r13 #cmd_queue
		iord $r14 I[$r11 + 0x100]	// FIFO_CMD
		iord $r15 I[$r11 + 0x000]	// FIFO_DATA
		call #queue_put
		add b32 $r11 0x400
		mov $r14 1
		iowr I[$r11 + 0x000] $r14	// FIFO_ACK

	// context switch request?
	ih_no_fifo:
	and $r11 $r10 0x00000100
	bra e #ih_no_ctxsw
		// enqueue a context switch for later processing
		mov $r13 #cmd_queue
		mov $r14 0x4001
		call #queue_put

	// anything we didn't handle, bring it to the host's attention
	ih_no_ctxsw:
	mov $r11 0x104
	not b32 $r11
	and $r11 $r10 $r11
	bra e #ih_no_other
		// build the register address in $r14 (saved above, and no
		// longer needed at this point) rather than in $r10, which
		// still has to carry the INTR bits to the ack below
		mov $r14 0xc1c
		shl b32 $r14 6
		iowr I[$r14] $r11	// INTR_UP_SET

	// ack, and wake up main()
	ih_no_other:
	iowr I[$r0 + 0x100] $r10	// INTR_ACK

	pop $r15
	pop $r14
	pop $r13
	pop $r11
	pop $r10
	pop $r9
	pop $r8
	mov $flags $r8
	pop $r8
	bclr $flags $p0
	iret

// Purpose unknown, but MEM_CMD 7 hangs forever unless this has been done
// first.
//
// Writes 1 to 0x404160, then polls the same register until bit 4 reads
// back set.  Clobbers $r14 and $r15.
//
ctx_4160s:
	mov $r15 1
	mov $r14 0x4160
	sethi $r14 0x400000		// 0x404160
	call #nv_wr32

	// $r14 still addresses 0x404160 for the reads below
	ctx_4160s_poll:
		call #nv_rd32
		xbit $r15 $r15 4
		bra e #ctx_4160s_poll
	ret

// Counterpart of ctx_4160s: writes 0 to 0x404160 again.
//
// If this is not cleared once a transfer has finished, some things make
// PGRAPH hang with STATUS=0x00000007 until it is cleared.. fbcon can
// still function with it set however...
//
// Clobbers $r14 and $r15.
//
ctx_4160c:
	clear b32 $r15
	mov $r14 0x4160
	sethi $r14 0x400000		// 0x404160
	call #nv_wr32
	ret

// Purpose unknown as well.  Writes ($r15 | 0x10) to 0x404170; use
// ctx_4170w to wait for bit 4 to drop again.
//
// In: $r15 value to set 0x404170 to
//
// Clobbers $r14, and leaves bit 4 set in $r15.
//
ctx_4170s:
	or $r15 0x10
	mov $r14 0x4170
	sethi $r14 0x400000		// 0x404170
	call #nv_wr32
	ret

// Waits for a ctx_4170s() call to complete
//
// Polls 0x404170 until bit 4 (the bit ctx_4170s sets) reads back clear.
// The loop branches back to the entry point, so the address is rebuilt
// on every iteration.  Clobbers $r14 and $r15.
//
ctx_4170w:
	mov $r14 0x4170
	sethi $r14 0x400000
514
	call #nv_rd32
515
	and $r15 0x10
516
	bra ne #ctx_4170w
517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532
	ret

// Disables various things, waits a bit, and re-enables them..
//
// Not sure how exactly this helps, perhaps "ENABLE" is not such a
// good description for the bits we turn off?  Anyways, without this,
// funny things happen.
//
// Writes 0x270 to HUB_RED_SWITCH, spins for 8 loop iterations, then
// writes 0x770.  Clobbers $r14 and $r15.
//
ctx_redswitch:
	mov $r14 0x614
	shl b32 $r14 6
	mov $r15 0x270
	iowr I[$r14] $r15	// HUB_RED_SWITCH = ENABLE_GPC, POWER_ALL
	// short busy-wait between the two writes
	mov $r15 8
	ctx_redswitch_delay:
		sub b32 $r15 1
533
		bra ne #ctx_redswitch_delay
534 535 536 537 538 539 540 541 542 543 544 545 546 547 548
	mov $r15 0x770
	iowr I[$r14] $r15	// HUB_RED_SWITCH = ENABLE_ALL, POWER_ALL
	ret

// Not a clue what this is for, except that unless the value is 0x10, the
// strand context is saved (and presumably restored) incorrectly..
//
// Writes the same value to three places: the HUB register through the
// fuc I/O space, and the ROP and GPC registers through MMIO writes.
//
// In: $r15 value to set to (0x00/0x10 are used)
//
// Clobbers $r14.  The second and third writes reuse $r15 without
// reloading it, so they rely on the MMIO write helper leaving it intact.
//
ctx_86c:
	mov $r14 0x86c
	shl b32 $r14 6
	iowr I[$r14] $r15	// HUB(0x86c) = val
	mov $r14 -0x75ec	// 0x408a14
	sethi $r14 0x400000
549
	call #nv_wr32		// ROP(0xa14) = val
550 551
	mov $r14 -0x5794	// 0x41a86c
	sethi $r14 0x410000
552
	call #nv_wr32		// GPC(0x86c) = val
553 554 555 556 557 558 559 560 561 562 563
	ret

// ctx_load - loads a channel's ctxctl data, and selects its vm
//
// Makes the channel current, reads the PGRAPH context pointer from the
// channel header into ctx_current, then fetches the first 256 bytes of
// that context into chan_data.
//
// In: $r2 channel address
//
// Clobbers $r1, $r2, $r3, $r4 and $r10 (plus whatever the wait helper
// touches).
//
ctx_load:
	trace_set(T_CHAN)

	// switch to channel, somewhat magic in parts..
	mov $r10 12		// DONE_UNK12
564
	call #wait_donez
565 566 567 568 569 570 571 572 573 574 575 576 577 578
	mov $r1 0xa24
	shl b32 $r1 6
	iowr I[$r1 + 0x000] $r0	// 0x409a24
	mov $r3 0xb00
	shl b32 $r3 6
	iowr I[$r3 + 0x100] $r2	// CHAN_NEXT
	mov $r1 0xa0c
	shl b32 $r1 6
	mov $r4 7
	iowr I[$r1 + 0x000] $r2 // MEM_CHAN
	iowr I[$r1 + 0x100] $r4	// MEM_CMD
	// poll MEM_CMD until its low 5 bits read back as zero
	ctx_chan_wait_0:
		iord $r4 I[$r1 + 0x100]
		and $r4 0x1f
579
		bra ne #ctx_chan_wait_0
580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597
	iowr I[$r3 + 0x000] $r2	// CHAN_CUR

	// load channel header, fetch PGRAPH context pointer
	// (MEM_BASE = (channel address without bit 31) << 4, plus 2)
	mov $xtargets $r0
	bclr $r2 31
	shl b32 $r2 4
	add b32 $r2 2

	trace_set(T_LCHAN)
	mov $r1 0xa04
	shl b32 $r1 6
	iowr I[$r1 + 0x000] $r2		// MEM_BASE
	mov $r1 0xa20
	shl b32 $r1 6
	mov $r2 0x0002
	sethi $r2 0x80000000
	iowr I[$r1 + 0x000] $r2		// MEM_TARGET = vram
	mov $r1 0x10			// chan + 0x0210
598
	mov $r2 #xfer_data
599 600 601 602 603 604
	sethi $r2 0x00020000		// 16 bytes
	xdld $r1 $r2
	xdwait
	trace_clr(T_LCHAN)

	// update current context
	// ($r1 = (second word << 24) | (first word >> 8), i.e. the two
	// words just read, taken as one wide value and shifted right by 8)
605
	ld b32 $r1 D[$r0 + #xfer_data + 4]
606
	shl b32 $r1 24
607
	ld b32 $r2 D[$r0 + #xfer_data + 0]
608 609
	shr b32 $r2 8
	or $r1 $r2
610
	st b32 D[$r0 + #ctx_current] $r1
611 612 613 614 615 616 617 618 619 620

	// set transfer base to start of context, and fetch context header
	trace_set(T_LCTXH)
	mov $r2 0xa04
	shl b32 $r2 6
	iowr I[$r2 + 0x000] $r1		// MEM_BASE
	mov $r2 1
	mov $r1 0xa20
	shl b32 $r1 6
	iowr I[$r1 + 0x000] $r2		// MEM_TARGET = vm
621
	mov $r1 #chan_data
622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637
	sethi $r1 0x00060000		// 256 bytes
	xdld $r0 $r1
	xdwait
	trace_clr(T_LCTXH)

	trace_clr(T_CHAN)
	ret

// ctx_chan - handler for HUB_SET_CHAN command, will set a channel as
//            the active channel for ctxctl, but not actually transfer
//            any context data.  intended for use only during initial
//            context construction.
//
// In: $r2 channel address
//
// Clobbers everything ctx_load does, plus $r14 and $r15 through the
// 0x404160 helpers.
//
ctx_chan:
638 639
	call #ctx_4160s
	call #ctx_load
640
	mov $r10 12			// DONE_UNK12
641
	call #wait_donez
642 643 644 645 646 647 648
	mov $r1 0xa10
	shl b32 $r1 6
	mov $r2 5
	iowr I[$r1 + 0x000] $r2		// MEM_CMD = 5 (???)
	// poll MEM_CMD until it reads back as zero
	ctx_chan_wait:
		iord $r2 I[$r1 + 0x000]
		or $r2 $r2
649 650
		bra ne #ctx_chan_wait
	call #ctx_4160c
651 652 653 654 655 656 657 658 659 660 661 662 663
	ret

// Execute per-context state overrides list
//
// Only executed on the first load of a channel.  Might want to look into
// removing this and having the host directly modify the channel's context
// to change this state...  The nouveau DRM already builds this list as
// it's definitely needed for NVIDIA's, so we may as well use it for now
//
// The list is read 256 bytes at a time into xfer_data.  Each entry is 8
// bytes: a register address followed by the value to write to it.
//
// Input: $r1 mmio list length
//        (number of entries; must be non-zero, the loop below decrements
//        before it tests.  The caller in ctx_xfer checks this.)
//
// Clobbers $r1-$r5, $r14 and $r15.
//
ctx_mmio_exec:
	// set transfer base to be the mmio list
664
	ld b32 $r3 D[$r0 + #chan_mmio_address]
665 666 667 668 669 670 671 672
	mov $r2 0xa04
	shl b32 $r2 6
	iowr I[$r2 + 0x000] $r3		// MEM_BASE

	// $r3 = byte offset into the list, $r4 = offset within xfer_data
	clear b32 $r3
	ctx_mmio_loop:
		// fetch next 256 bytes of mmio list if necessary
		and $r4 $r3 0xff
673 674
		bra ne #ctx_mmio_pull
			mov $r5 #xfer_data
675 676 677 678 679 680
			sethi $r5 0x00060000	// 256 bytes
			xdld $r3 $r5
			xdwait

		// execute a single list entry
		ctx_mmio_pull:
681 682 683
		ld b32 $r14 D[$r4 + #xfer_data + 0x00]
		ld b32 $r15 D[$r4 + #xfer_data + 0x04]
		call #nv_wr32
684 685 686 687

		// next!
		add b32 $r3 8
		sub b32 $r1 1
688
		bra ne #ctx_mmio_loop
689 690 691

	// set transfer base back to the current context
	// ($r2 still addresses MEM_BASE)
	ctx_mmio_done:
692
	ld b32 $r3 D[$r0 + #ctx_current]
693 694 695
	iowr I[$r2 + 0x000] $r3		// MEM_BASE

	// disable the mmio list now, we don't need/want to execute it again
	// (the zeroed count is written back to the context together with
	// the rest of the 256-byte chan_data buffer)
696 697
	st b32 D[$r0 + #chan_mmio_count] $r0
	mov $r1 #chan_data
698 699 700 701 702 703 704 705 706 707 708 709 710 711
	sethi $r1 0x00060000		// 256 bytes
	xdst $r0 $r1
	xdwait
	ret

// Transfer HUB context data between GPU and storage area
//
// In: $r2 channel address
//     $p1 clear on save, set on load
//     $p2 set if opposite direction done/will be done, so:
//		on save it means: "a load will follow this save"
//		on load it means: "a save preceded this load"
//
// $r2 is only consumed on loads (it is handed to ctx_load); a save works
// on the context recorded in ctx_current.
//
// Pre/post steps by case:
//   save, no load follows:	pre, xfer, MEM_CMD 5, post
//   save, load follows:	pre, xfer, MEM_CMD 5
//   load, after a save:	pre_load, xfer, post
//   load, no save before:	pre, pre_load, xfer, post
//
ctx_xfer:
712 713
	bra not $p1 #ctx_xfer_pre
	bra $p2 #ctx_xfer_pre_load
714 715
	ctx_xfer_pre:
		mov $r15 0x10
716 717 718
		call #ctx_86c
		call #ctx_4160s
		bra not $p1 #ctx_xfer_exec
719 720 721

	// loads only: switch to the channel in $r2 first
	ctx_xfer_pre_load:
		mov $r15 2
722 723 724
		call #ctx_4170s
		call #ctx_4170w
		call #ctx_redswitch
725
		clear b32 $r15
726 727
		call #ctx_4170s
		call #ctx_load
728 729 730

	// fetch context pointer, and initiate xfer on all GPCs
	ctx_xfer_exec:
731
	ld b32 $r1 D[$r0 + #ctx_current]
732 733 734 735 736 737
	mov $r2 0x414
	shl b32 $r2 6
	iowr I[$r2 + 0x000] $r0	// BAR_STATUS = reset
	mov $r14 -0x5b00	// 0x41a500
	sethi $r14 0x410000
	mov b32 $r15 $r1
738
	call #nv_wr32		// GPC_BCAST_WRCMD_DATA = ctx pointer
739 740 741 742 743
	// command value: bit 0 = $p1 (direction), bit 1 = $p2
	add b32 $r14 4
	xbit $r15 $flags $p1
	xbit $r2 $flags $p2
	shl b32 $r2 1
	or $r15 $r2
744
	call #nv_wr32		// GPC_BCAST_WRCMD_CMD = GPC_XFER(type)
745 746 747 748 749 750

	// strands
	mov $r1 0x4afc
	sethi $r1 0x20000
	mov $r2 0xc
	iowr I[$r1] $r2		// STRAND_CMD(0x3f) = 0x0c
751
	call #strand_wait
752 753 754 755 756 757 758 759 760 761 762
	mov $r2 0x47fc
	sethi $r2 0x20000
	iowr I[$r2] $r0		// STRAND_FIRST_GENE(0x3f) = 0x00
	xbit $r2 $flags $p1
	add b32 $r2 3
	iowr I[$r1] $r2		// STRAND_CMD(0x3f) = 0x03/0x04 (SAVE/LOAD)

	// mmio context
	xbit $r10 $flags $p1	// direction
	or $r10 6		// first, last
	mov $r11 0		// base = 0
763 764
	ld b32 $r12 D[$r0 + #hub_mmio_list_head]
	ld b32 $r13 D[$r0 + #hub_mmio_list_tail]
765
	mov $r14 0		// not multi
766
	call #mmctx_xfer
767 768 769

	// wait for GPCs to all complete
	mov $r10 8		// DONE_BAR
770
	call #wait_doneo
771 772

	// wait for strand xfer to complete
773
	call #strand_wait
774 775

	// post-op
	// (loads go straight to the common post steps; saves issue
	// MEM_CMD 5 and wait for it to read back as zero first)
776
	bra $p1 #ctx_xfer_post
777
		mov $r10 12		// DONE_UNK12
778
		call #wait_donez
779 780 781 782 783 784 785
		mov $r1 0xa10
		shl b32 $r1 6
		mov $r2 5
		iowr I[$r1] $r2		// MEM_CMD
		ctx_xfer_post_save_wait:
			iord $r2 I[$r1]
			or $r2 $r2
786
			bra ne #ctx_xfer_post_save_wait
787

	// only reached on saves: when a load follows, the post steps are
	// left to that load
788
	bra $p2 #ctx_xfer_done
789 790
	ctx_xfer_post:
		mov $r15 2
791
		call #ctx_4170s
792
		clear b32 $r15
793 794 795
		call #ctx_86c
		call #strand_post
		call #ctx_4170w
796
		clear b32 $r15
797
		call #ctx_4170s
798

		// on loads, run the per-channel mmio list if the context
		// header fetched by ctx_load has a non-zero entry count
799 800
		bra not $p1 #ctx_xfer_no_post_mmio
		ld b32 $r1 D[$r0 + #chan_mmio_count]
801
		or $r1 $r1
802 803
		bra e #ctx_xfer_no_post_mmio
			call #ctx_mmio_exec
804 805

		ctx_xfer_no_post_mmio:
806
		call #ctx_4160c
807 808 809 810 811

	ctx_xfer_done:
	ret

// pad the end of the code section to a 256-byte boundary
.align 256