intel_lrc.c 79.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

31 32 33 34
/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
35 36 37 38
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things to the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there mean you don't need to do a ppgtt->switch_mm yourself,
 * instead, the GPU will do it for you on the context switch.
 *
 * But, what about the ringbuffer control registers (head, tail, etc..)?
 * shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a contexts is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use them. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
90 91
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bits submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one requests is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different context,
 * which will cause the hardware to continue executing the second request and queue
 * the new request (the GPU detects the condition of a context getting preempted
 * with the same context and optimizes the context switch flow by not doing
 * preemption, but just sampling the new tail pointer).
 *
133
 */
134
#include <linux/interrupt.h>
135 136 137 138

#include <drm/drmP.h>
#include <drm/i915_drm.h>
#include "i915_drv.h"
139
#include "i915_gem_render_state.h"
140
#include "intel_lrc_reg.h"
141
#include "intel_mocs.h"
142

143 144 145 146 147 148 149 150 151 152 153 154 155
#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
156

157
#define GEN8_CTX_STATUS_COMPLETED_MASK \
158
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
159

160 161
/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
162
#define WA_TAIL_DWORDS 2
163
#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
164

165
static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
166
					    struct intel_engine_cs *engine);
167 168 169 170
static void execlists_init_reg_state(u32 *reg_state,
				     struct i915_gem_context *ctx,
				     struct intel_engine_cs *engine,
				     struct intel_ring *ring);
171

172 173 174 175 176 177 178 179 180 181 182 183 184 185
static inline struct i915_priolist *to_priolist(struct rb_node *rb)
{
	return rb_entry(rb, struct i915_priolist, node);
}

static inline int rq_prio(const struct i915_request *rq)
{
	return rq->priotree.priority;
}

static inline bool need_preempt(const struct intel_engine_cs *engine,
				const struct i915_request *last,
				int prio)
{
186 187
	return (intel_engine_has_preemption(engine) &&
		__execlists_need_preempt(prio, rq_prio(last)));
188 189
}

190
/**
191 192 193
 * intel_lr_context_descriptor_update() - calculate & cache the descriptor
 * 					  descriptor for a pinned context
 * @ctx: Context to work on
194
 * @engine: Engine the descriptor will be used with
195
 *
196 197 198 199 200
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
201 202
 * This is what a descriptor looks like, from LSB to MSB::
 *
203
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
204 205 206 207
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
208 209 210 211 212 213 214 215 216 217 218 219
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48:53:    engine instance
 *      bit 54:        mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
220
 */
221
static void
222
intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
223
				   struct intel_engine_cs *engine)
224
{
225
	struct intel_context *ce = &ctx->engine[engine->id];
226
	u64 desc;
227

228 229
	BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH)));
	BUILD_BUG_ON(GEN11_MAX_CONTEXT_HW_ID > (BIT(GEN11_SW_CTX_ID_WIDTH)));
230

231
	desc = ctx->desc_template;				/* bits  0-11 */
232 233
	GEM_BUG_ON(desc & GENMASK_ULL(63, 12));

234
	desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE;
235
								/* bits 12-31 */
236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253
	GEM_BUG_ON(desc & GENMASK_ULL(63, 32));

	if (INTEL_GEN(ctx->i915) >= 11) {
		GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
		desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;
								/* bits 37-47 */

		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
								/* bits 48-53 */

		/* TODO: decide what to do with SW counter (bits 55-60) */

		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
								/* bits 61-63 */
	} else {
		GEM_BUG_ON(ctx->hw_id >= BIT(GEN8_CTX_ID_WIDTH));
		desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT;	/* bits 32-52 */
	}
254

255
	ce->lrc_desc = desc;
256 257
}

258 259 260 261
static struct i915_priolist *
lookup_priolist(struct intel_engine_cs *engine,
		struct i915_priotree *pt,
		int prio)
262
{
263
	struct intel_engine_execlists * const execlists = &engine->execlists;
264 265 266 267
	struct i915_priolist *p;
	struct rb_node **parent, *rb;
	bool first = true;

268
	if (unlikely(execlists->no_priolist))
269 270 271 272 273
		prio = I915_PRIORITY_NORMAL;

find_priolist:
	/* most positive priority is scheduled first, equal priorities fifo */
	rb = NULL;
274
	parent = &execlists->queue.rb_node;
275 276
	while (*parent) {
		rb = *parent;
277
		p = to_priolist(rb);
278 279 280 281 282 283
		if (prio > p->priority) {
			parent = &rb->rb_left;
		} else if (prio < p->priority) {
			parent = &rb->rb_right;
			first = false;
		} else {
284
			return p;
285 286 287 288
		}
	}

	if (prio == I915_PRIORITY_NORMAL) {
289
		p = &execlists->default_priolist;
290 291 292 293 294 295 296 297 298 299 300 301 302 303
	} else {
		p = kmem_cache_alloc(engine->i915->priorities, GFP_ATOMIC);
		/* Convert an allocation failure to a priority bump */
		if (unlikely(!p)) {
			prio = I915_PRIORITY_NORMAL; /* recurses just once */

			/* To maintain ordering with all rendering, after an
			 * allocation failure we have to disable all scheduling.
			 * Requests will then be executed in fifo, and schedule
			 * will ensure that dependencies are emitted in fifo.
			 * There will be still some reordering with existing
			 * requests, so if userspace lied about their
			 * dependencies that reordering may be visible.
			 */
304
			execlists->no_priolist = true;
305 306 307 308 309
			goto find_priolist;
		}
	}

	p->priority = prio;
310
	INIT_LIST_HEAD(&p->requests);
311
	rb_link_node(&p->node, rb, parent);
312
	rb_insert_color(&p->node, &execlists->queue);
313 314

	if (first)
315
		execlists->first = &p->node;
316

317
	return p;
318 319
}

320
static void unwind_wa_tail(struct i915_request *rq)
321 322 323 324 325
{
	rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
	assert_ring_tail_valid(rq->ring, rq->tail);
}

326
static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
327
{
328
	struct i915_request *rq, *rn;
329 330
	struct i915_priolist *uninitialized_var(p);
	int last_prio = I915_PRIORITY_INVALID;
331 332 333 334 335 336

	lockdep_assert_held(&engine->timeline->lock);

	list_for_each_entry_safe_reverse(rq, rn,
					 &engine->timeline->requests,
					 link) {
337
		if (i915_request_completed(rq))
338 339
			return;

340
		__i915_request_unsubmit(rq);
341 342
		unwind_wa_tail(rq);

343 344 345 346
		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
		if (rq_prio(rq) != last_prio) {
			last_prio = rq_prio(rq);
			p = lookup_priolist(engine, &rq->priotree, last_prio);
347 348 349
		}

		list_add(&rq->priotree.link, &p->requests);
350 351 352
	}
}

353
void
354 355 356 357 358 359 360 361 362 363
execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
{
	struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);

	spin_lock_irq(&engine->timeline->lock);
	__unwind_incomplete_requests(engine);
	spin_unlock_irq(&engine->timeline->lock);
}

364
static inline void
365
execlists_context_status_change(struct i915_request *rq, unsigned long status)
366
{
367 368 369 370 371 372
	/*
	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
	 * The compiler should eliminate this function as dead-code.
	 */
	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
		return;
373

374 375
	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
				   status, rq);
376 377
}

378 379 380 381 382 383 384 385 386 387 388 389 390
inline void
execlists_user_begin(struct intel_engine_execlists *execlists,
		     const struct execlist_port *port)
{
	execlists_set_active_once(execlists, EXECLISTS_ACTIVE_USER);
}

inline void
execlists_user_end(struct intel_engine_execlists *execlists)
{
	execlists_clear_active(execlists, EXECLISTS_ACTIVE_USER);
}

391
static inline void
392
execlists_context_schedule_in(struct i915_request *rq)
393 394
{
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
395
	intel_engine_context_in(rq->engine);
396 397 398
}

static inline void
399
execlists_context_schedule_out(struct i915_request *rq)
400
{
401
	intel_engine_context_out(rq->engine);
402 403 404
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
}

405 406 407 408 409 410 411 412 413
static void
execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
{
	ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
}

414
static u64 execlists_update_context(struct i915_request *rq)
415
{
416
	struct intel_context *ce = &rq->ctx->engine[rq->engine->id];
417 418
	struct i915_hw_ppgtt *ppgtt =
		rq->ctx->ppgtt ?: rq->i915->mm.aliasing_ppgtt;
419
	u32 *reg_state = ce->lrc_reg_state;
420

421
	reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);
422

423 424 425 426 427
	/* True 32b PPGTT with dynamic page allocation: update PDP
	 * registers and point the unallocated PDPs to scratch page.
	 * PML4 is allocated during ppgtt init, so this is not needed
	 * in 48-bit mode.
	 */
428
	if (ppgtt && !i915_vm_is_48bit(&ppgtt->base))
429
		execlists_update_context_pdps(ppgtt, reg_state);
430 431

	return ce->lrc_desc;
432 433
}

434
static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
C
Chris Wilson 已提交
435
{
436 437 438 439 440 441 442
	if (execlists->ctrl_reg) {
		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
	} else {
		writel(upper_32_bits(desc), execlists->submit_reg);
		writel(lower_32_bits(desc), execlists->submit_reg);
	}
C
Chris Wilson 已提交
443 444
}

445
static void execlists_submit_ports(struct intel_engine_cs *engine)
446
{
447 448
	struct intel_engine_execlists *execlists = &engine->execlists;
	struct execlist_port *port = execlists->port;
449
	unsigned int n;
450

451 452 453 454 455 456 457
	/*
	 * ELSQ note: the submit queue is not cleared after being submitted
	 * to the HW so we need to make sure we always clean it up. This is
	 * currently ensured by the fact that we always write the same number
	 * of elsq entries, keep this in mind before changing the loop below.
	 */
	for (n = execlists_num_ports(execlists); n--; ) {
458
		struct i915_request *rq;
459 460 461 462 463 464 465
		unsigned int count;
		u64 desc;

		rq = port_unpack(&port[n], &count);
		if (rq) {
			GEM_BUG_ON(count > !n);
			if (!count++)
466
				execlists_context_schedule_in(rq);
467 468 469
			port_set(&port[n], port_pack(rq, count));
			desc = execlists_update_context(rq);
			GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
470

471
			GEM_TRACE("%s in[%d]:  ctx=%d.%d, global=%d (fence %llx:%d) (current %d), prio=%d\n",
472
				  engine->name, n,
473
				  port[n].context_id, count,
474
				  rq->global_seqno,
475
				  rq->fence.context, rq->fence.seqno,
476
				  intel_engine_get_seqno(engine),
477
				  rq_prio(rq));
478 479 480 481
		} else {
			GEM_BUG_ON(!n);
			desc = 0;
		}
482

483
		write_desc(execlists, desc, n);
484
	}
485 486 487 488 489 490

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);

	execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
491 492
}

493
static bool ctx_single_port_submission(const struct i915_gem_context *ctx)
494
{
495
	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
496
		i915_gem_context_force_single_submission(ctx));
497
}
498

499 500 501 502 503
static bool can_merge_ctx(const struct i915_gem_context *prev,
			  const struct i915_gem_context *next)
{
	if (prev != next)
		return false;
504

505 506
	if (ctx_single_port_submission(prev))
		return false;
507

508
	return true;
509 510
}

511
static void port_assign(struct execlist_port *port, struct i915_request *rq)
512 513 514 515
{
	GEM_BUG_ON(rq == port_request(port));

	if (port_isset(port))
516
		i915_request_put(port_request(port));
517

518
	port_set(port, port_pack(i915_request_get(rq), port_count(port)));
519 520
}

C
Chris Wilson 已提交
521 522
static void inject_preempt_context(struct intel_engine_cs *engine)
{
523
	struct intel_engine_execlists *execlists = &engine->execlists;
C
Chris Wilson 已提交
524 525 526 527
	struct intel_context *ce =
		&engine->i915->preempt_context->engine[engine->id];
	unsigned int n;

528
	GEM_BUG_ON(execlists->preempt_complete_status !=
529
		   upper_32_bits(ce->lrc_desc));
530 531 532 533 534 535
	GEM_BUG_ON((ce->lrc_reg_state[CTX_CONTEXT_CONTROL + 1] &
		    _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
				       CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT)) !=
		   _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
				      CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT));

536 537 538 539
	/*
	 * Switch to our empty preempt context so
	 * the state of the GPU is known (idle).
	 */
540
	GEM_TRACE("%s\n", engine->name);
541 542 543 544 545 546 547 548
	for (n = execlists_num_ports(execlists); --n; )
		write_desc(execlists, 0, n);

	write_desc(execlists, ce->lrc_desc, n);

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
C
Chris Wilson 已提交
549

550
	execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
551
	execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT);
C
Chris Wilson 已提交
552 553
}

554
static void execlists_dequeue(struct intel_engine_cs *engine)
555
{
556 557
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct execlist_port *port = execlists->port;
558 559
	const struct execlist_port * const last_port =
		&execlists->port[execlists->port_mask];
560
	struct i915_request *last = port_request(port);
561
	struct rb_node *rb;
562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582
	bool submit = false;

	/* Hardware submission is through 2 ports. Conceptually each port
	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
	 * static for a context, and unique to each, so we only execute
	 * requests belonging to a single context from each ring. RING_HEAD
	 * is maintained by the CS in the context image, it marks the place
	 * where it got up to last time, and through RING_TAIL we tell the CS
	 * where we want to execute up to this time.
	 *
	 * In this list the requests are in order of execution. Consecutive
	 * requests from the same context are adjacent in the ringbuffer. We
	 * can combine these requests into a single RING_TAIL update:
	 *
	 *              RING_HEAD...req1...req2
	 *                                    ^- RING_TAIL
	 * since to execute req2 the CS must first execute req1.
	 *
	 * Our goal then is to point each port to the end of a consecutive
	 * sequence of requests as being the most optimal (fewest wake ups
	 * and context switches) submission.
583
	 */
584

585
	spin_lock_irq(&engine->timeline->lock);
586 587
	rb = execlists->first;
	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
C
Chris Wilson 已提交
588 589 590 591 592 593 594 595

	if (last) {
		/*
		 * Don't resubmit or switch until all outstanding
		 * preemptions (lite-restore) are seen. Then we
		 * know the next preemption status we see corresponds
		 * to this ELSP update.
		 */
596 597
		GEM_BUG_ON(!execlists_is_active(execlists,
						EXECLISTS_ACTIVE_USER));
598
		GEM_BUG_ON(!port_count(&port[0]));
C
Chris Wilson 已提交
599 600 601
		if (port_count(&port[0]) > 1)
			goto unlock;

602 603 604 605 606 607 608 609 610 611
		/*
		 * If we write to ELSP a second time before the HW has had
		 * a chance to respond to the previous write, we can confuse
		 * the HW and hit "undefined behaviour". After writing to ELSP,
		 * we must then wait until we see a context-switch event from
		 * the HW to indicate that it has had a chance to respond.
		 */
		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
			goto unlock;

612
		if (need_preempt(engine, last, execlists->queue_priority)) {
C
Chris Wilson 已提交
613 614 615
			inject_preempt_context(engine);
			goto unlock;
		}
616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649

		/*
		 * In theory, we could coalesce more requests onto
		 * the second port (the first port is active, with
		 * no preemptions pending). However, that means we
		 * then have to deal with the possible lite-restore
		 * of the second port (as we submit the ELSP, there
		 * may be a context-switch) but also we may complete
		 * the resubmission before the context-switch. Ergo,
		 * coalescing onto the second port will cause a
		 * preemption event, but we cannot predict whether
		 * that will affect port[0] or port[1].
		 *
		 * If the second port is already active, we can wait
		 * until the next context-switch before contemplating
		 * new requests. The GPU will be busy and we should be
		 * able to resubmit the new ELSP before it idles,
		 * avoiding pipeline bubbles (momentary pauses where
		 * the driver is unable to keep up the supply of new
		 * work). However, we have to double check that the
		 * priorities of the ports haven't been switch.
		 */
		if (port_count(&port[1]))
			goto unlock;

		/*
		 * WaIdleLiteRestore:bdw,skl
		 * Apply the wa NOOPs to prevent
		 * ring:HEAD == rq:TAIL as we resubmit the
		 * request. See gen8_emit_breadcrumb() for
		 * where we prepare the padding after the
		 * end of the request.
		 */
		last->tail = last->wa_tail;
C
Chris Wilson 已提交
650 651
	}

652 653
	while (rb) {
		struct i915_priolist *p = to_priolist(rb);
654
		struct i915_request *rq, *rn;
655 656 657 658 659 660 661 662 663 664 665 666

		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
			/*
			 * Can we combine this request with the current port?
			 * It has to be the same context/ringbuffer and not
			 * have any exceptions (e.g. GVT saying never to
			 * combine contexts).
			 *
			 * If we can combine the requests, we can execute both
			 * by updating the RING_TAIL to point to the end of the
			 * second request, and so we never need to tell the
			 * hardware about the first.
667
			 */
668 669 670 671 672 673
			if (last && !can_merge_ctx(rq->ctx, last->ctx)) {
				/*
				 * If we are on the second port and cannot
				 * combine this request with the last, then we
				 * are done.
				 */
674
				if (port == last_port) {
675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698
					__list_del_many(&p->requests,
							&rq->priotree.link);
					goto done;
				}

				/*
				 * If GVT overrides us we only ever submit
				 * port[0], leaving port[1] empty. Note that we
				 * also have to be careful that we don't queue
				 * the same context (even though a different
				 * request) to the second port.
				 */
				if (ctx_single_port_submission(last->ctx) ||
				    ctx_single_port_submission(rq->ctx)) {
					__list_del_many(&p->requests,
							&rq->priotree.link);
					goto done;
				}

				GEM_BUG_ON(last->ctx == rq->ctx);

				if (submit)
					port_assign(port, last);
				port++;
699 700

				GEM_BUG_ON(port_isset(port));
701
			}
702

703
			INIT_LIST_HEAD(&rq->priotree.link);
704 705
			__i915_request_submit(rq);
			trace_i915_request_in(rq, port_index(port, execlists));
706 707
			last = rq;
			submit = true;
708
		}
709

710
		rb = rb_next(rb);
711
		rb_erase(&p->node, &execlists->queue);
712 713
		INIT_LIST_HEAD(&p->requests);
		if (p->priority != I915_PRIORITY_NORMAL)
714
			kmem_cache_free(engine->i915->priorities, p);
715
	}
716
done:
717
	execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
718
	execlists->first = rb;
719
	if (submit)
720
		port_assign(port, last);
721 722 723 724

	/* We must always keep the beast fed if we have work piled up */
	GEM_BUG_ON(execlists->first && !port_isset(execlists->port));

C
Chris Wilson 已提交
725
unlock:
726
	spin_unlock_irq(&engine->timeline->lock);
727

728
	if (submit) {
729
		execlists_user_begin(execlists, execlists->port);
730
		execlists_submit_ports(engine);
731
	}
732 733 734

	GEM_BUG_ON(port_isset(execlists->port) &&
		   !execlists_is_active(execlists, EXECLISTS_ACTIVE_USER));
735 736
}

737
void
738
execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
739
{
740
	struct execlist_port *port = execlists->port;
741
	unsigned int num_ports = execlists_num_ports(execlists);
742

743
	while (num_ports-- && port_isset(port)) {
744
		struct i915_request *rq = port_request(port);
745

746 747 748 749 750 751 752
		GEM_TRACE("%s:port%u global=%d (fence %llx:%d), (current %d)\n",
			  rq->engine->name,
			  (unsigned int)(port - execlists->port),
			  rq->global_seqno,
			  rq->fence.context, rq->fence.seqno,
			  intel_engine_get_seqno(rq->engine));

753
		GEM_BUG_ON(!execlists->active);
754
		intel_engine_context_out(rq->engine);
755 756 757 758 759 760

		execlists_context_status_change(rq,
						i915_request_completed(rq) ?
						INTEL_CONTEXT_SCHEDULE_OUT :
						INTEL_CONTEXT_SCHEDULE_PREEMPTED);

761
		i915_request_put(rq);
762

763 764 765
		memset(port, 0, sizeof(*port));
		port++;
	}
766

767
	execlists_user_end(execlists);
768 769
}

770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820
static void clear_gtiir(struct intel_engine_cs *engine)
{
	static const u8 gtiir[] = {
		[RCS]  = 0,
		[BCS]  = 0,
		[VCS]  = 1,
		[VCS2] = 1,
		[VECS] = 3,
	};
	struct drm_i915_private *dev_priv = engine->i915;
	int i;

	/* TODO: correctly reset irqs for gen11 */
	if (WARN_ON_ONCE(INTEL_GEN(engine->i915) >= 11))
		return;

	GEM_BUG_ON(engine->id >= ARRAY_SIZE(gtiir));

	/*
	 * Clear any pending interrupt state.
	 *
	 * We do it twice out of paranoia that some of the IIR are
	 * double buffered, and so if we only reset it once there may
	 * still be an interrupt pending.
	 */
	for (i = 0; i < 2; i++) {
		I915_WRITE(GEN8_GT_IIR(gtiir[engine->id]),
			   engine->irq_keep_mask);
		POSTING_READ(GEN8_GT_IIR(gtiir[engine->id]));
	}
	GEM_BUG_ON(I915_READ(GEN8_GT_IIR(gtiir[engine->id])) &
		   engine->irq_keep_mask);
}

static void reset_irq(struct intel_engine_cs *engine)
{
	/* Mark all CS interrupts as complete */
	smp_store_mb(engine->execlists.active, 0);
	synchronize_hardirq(engine->i915->drm.irq);

	clear_gtiir(engine);

	/*
	 * The port is checked prior to scheduling a tasklet, but
	 * just in case we have suspended the tasklet to do the
	 * wedging make sure that when it wakes, it decides there
	 * is no work to do by clearing the irq_posted bit.
	 */
	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
}

821 822
static void execlists_cancel_requests(struct intel_engine_cs *engine)
{
823
	struct intel_engine_execlists * const execlists = &engine->execlists;
824
	struct i915_request *rq, *rn;
825 826 827
	struct rb_node *rb;
	unsigned long flags;

828 829
	GEM_TRACE("%s current %d\n",
		  engine->name, intel_engine_get_seqno(engine));
830

831 832 833 834 835 836 837 838 839 840 841 842 843 844 845
	/*
	 * Before we call engine->cancel_requests(), we should have exclusive
	 * access to the submission state. This is arranged for us by the
	 * caller disabling the interrupt generation, the tasklet and other
	 * threads that may then access the same state, giving us a free hand
	 * to reset state. However, we still need to let lockdep be aware that
	 * we know this state may be accessed in hardirq context, so we
	 * disable the irq around this manipulation and we want to keep
	 * the spinlock focused on its duties and not accidentally conflate
	 * coverage to the submission's irq state. (Similarly, although we
	 * shouldn't need to disable irq around the manipulation of the
	 * submission's irq state, we also wish to remind ourselves that
	 * it is irq state.)
	 */
	local_irq_save(flags);
846 847

	/* Cancel the requests on the HW and clear the ELSP tracker. */
848
	execlists_cancel_port_requests(execlists);
849
	reset_irq(engine);
850

851 852
	spin_lock(&engine->timeline->lock);

853 854 855
	/* Mark all executing requests as skipped. */
	list_for_each_entry(rq, &engine->timeline->requests, link) {
		GEM_BUG_ON(!rq->global_seqno);
856
		if (!i915_request_completed(rq))
857 858 859 860
			dma_fence_set_error(&rq->fence, -EIO);
	}

	/* Flush the queued requests to the timeline list (for retiring). */
861
	rb = execlists->first;
862
	while (rb) {
863
		struct i915_priolist *p = to_priolist(rb);
864 865 866 867 868

		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
			INIT_LIST_HEAD(&rq->priotree.link);

			dma_fence_set_error(&rq->fence, -EIO);
869
			__i915_request_submit(rq);
870 871 872
		}

		rb = rb_next(rb);
873
		rb_erase(&p->node, &execlists->queue);
874 875 876 877 878 879 880
		INIT_LIST_HEAD(&p->requests);
		if (p->priority != I915_PRIORITY_NORMAL)
			kmem_cache_free(engine->i915->priorities, p);
	}

	/* Remaining _unready_ requests will be nop'ed when submitted */

881
	execlists->queue_priority = INT_MIN;
882 883
	execlists->queue = RB_ROOT;
	execlists->first = NULL;
884
	GEM_BUG_ON(port_isset(execlists->port));
885

886 887 888
	spin_unlock(&engine->timeline->lock);

	local_irq_restore(flags);
889 890
}

891
/*
892 893 894
 * Check the unread Context Status Buffers and manage the submission of new
 * contexts to the ELSP accordingly.
 */
895
static void execlists_submission_tasklet(unsigned long data)
896
{
897 898
	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
	struct intel_engine_execlists * const execlists = &engine->execlists;
899
	struct execlist_port *port = execlists->port;
900
	struct drm_i915_private *dev_priv = engine->i915;
901
	bool fw = false;
902

903 904
	/*
	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
905 906 907 908 909 910 911 912
	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
	 * not be relinquished until the device is idle (see
	 * i915_gem_idle_work_handler()). As a precaution, we make sure
	 * that all ELSP are drained i.e. we have processed the CSB,
	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
	 */
	GEM_BUG_ON(!dev_priv->gt.awake);

913 914
	/*
	 * Prefer doing test_and_clear_bit() as a two stage operation to avoid
915 916 917 918
	 * imposing the cost of a locked atomic transaction when submitting a
	 * new request (outside of the context-switch interrupt).
	 */
	while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
919 920 921
		/* The HWSP contains a (cacheable) mirror of the CSB */
		const u32 *buf =
			&engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
922
		unsigned int head, tail;
923

924
		if (unlikely(execlists->csb_use_mmio)) {
925 926
			buf = (u32 * __force)
				(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
927
			execlists->csb_head = -1; /* force mmio read of CSB ptrs */
928 929
		}

930 931 932 933
		/* Clear before reading to catch new interrupts */
		clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
		smp_mb__after_atomic();

934
		if (unlikely(execlists->csb_head == -1)) { /* following a reset */
935 936 937 938 939 940
			if (!fw) {
				intel_uncore_forcewake_get(dev_priv,
							   execlists->fw_domains);
				fw = true;
			}

941 942 943
			head = readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
			tail = GEN8_CSB_WRITE_PTR(head);
			head = GEN8_CSB_READ_PTR(head);
944
			execlists->csb_head = head;
945 946 947 948 949
		} else {
			const int write_idx =
				intel_hws_csb_write_index(dev_priv) -
				I915_HWS_CSB_BUF0_INDEX;

950
			head = execlists->csb_head;
951 952
			tail = READ_ONCE(buf[write_idx]);
		}
953
		GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
954
			  engine->name,
955 956
			  head, GEN8_CSB_READ_PTR(readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?",
			  tail, GEN8_CSB_WRITE_PTR(readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?");
957

958
		while (head != tail) {
959
			struct i915_request *rq;
960
			unsigned int status;
961
			unsigned int count;
962 963 964

			if (++head == GEN8_CSB_ENTRIES)
				head = 0;
965

966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982
			/* We are flying near dragons again.
			 *
			 * We hold a reference to the request in execlist_port[]
			 * but no more than that. We are operating in softirq
			 * context and so cannot hold any mutex or sleep. That
			 * prevents us stopping the requests we are processing
			 * in port[] from being retired simultaneously (the
			 * breadcrumb will be complete before we see the
			 * context-switch). As we only hold the reference to the
			 * request, any pointer chasing underneath the request
			 * is subject to a potential use-after-free. Thus we
			 * store all of the bookkeeping within port[] as
			 * required, and avoid using unguarded pointers beneath
			 * request itself. The same applies to the atomic
			 * status notifier.
			 */

983
			status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
984
			GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
985
				  engine->name, head,
986 987
				  status, buf[2*head + 1],
				  execlists->active);
988 989 990 991 992 993 994 995 996

			if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
				      GEN8_CTX_STATUS_PREEMPTED))
				execlists_set_active(execlists,
						     EXECLISTS_ACTIVE_HWACK);
			if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
				execlists_clear_active(execlists,
						       EXECLISTS_ACTIVE_HWACK);

997 998 999
			if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
				continue;

1000 1001 1002
			/* We should never get a COMPLETED | IDLE_ACTIVE! */
			GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);

1003
			if (status & GEN8_CTX_STATUS_COMPLETE &&
1004
			    buf[2*head + 1] == execlists->preempt_complete_status) {
1005 1006
				GEM_TRACE("%s preempt-idle\n", engine->name);

1007 1008
				execlists_cancel_port_requests(execlists);
				execlists_unwind_incomplete_requests(execlists);
C
Chris Wilson 已提交
1009

1010 1011 1012 1013
				GEM_BUG_ON(!execlists_is_active(execlists,
								EXECLISTS_ACTIVE_PREEMPT));
				execlists_clear_active(execlists,
						       EXECLISTS_ACTIVE_PREEMPT);
C
Chris Wilson 已提交
1014 1015 1016 1017
				continue;
			}

			if (status & GEN8_CTX_STATUS_PREEMPTED &&
1018 1019
			    execlists_is_active(execlists,
						EXECLISTS_ACTIVE_PREEMPT))
C
Chris Wilson 已提交
1020 1021
				continue;

1022 1023 1024
			GEM_BUG_ON(!execlists_is_active(execlists,
							EXECLISTS_ACTIVE_USER));

1025
			rq = port_unpack(port, &count);
1026
			GEM_TRACE("%s out[0]: ctx=%d.%d, global=%d (fence %llx:%d) (current %d), prio=%d\n",
1027
				  engine->name,
1028
				  port->context_id, count,
1029
				  rq ? rq->global_seqno : 0,
1030 1031
				  rq ? rq->fence.context : 0,
				  rq ? rq->fence.seqno : 0,
1032
				  intel_engine_get_seqno(engine),
1033
				  rq ? rq_prio(rq) : 0);
1034 1035 1036 1037

			/* Check the context/desc id for this event matches */
			GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);

1038 1039
			GEM_BUG_ON(count == 0);
			if (--count == 0) {
1040 1041 1042 1043 1044 1045 1046 1047
				/*
				 * On the final event corresponding to the
				 * submission of this context, we expect either
				 * an element-switch event or a completion
				 * event (and on completion, the active-idle
				 * marker). No more preemptions, lite-restore
				 * or otherwise.
				 */
1048
				GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
1049 1050
				GEM_BUG_ON(port_isset(&port[1]) &&
					   !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
1051 1052 1053 1054 1055 1056 1057 1058 1059
				GEM_BUG_ON(!port_isset(&port[1]) &&
					   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));

				/*
				 * We rely on the hardware being strongly
				 * ordered, that the breadcrumb write is
				 * coherent (visible from the CPU) before the
				 * user interrupt and CSB is processed.
				 */
1060
				GEM_BUG_ON(!i915_request_completed(rq));
1061

1062
				execlists_context_schedule_out(rq);
1063 1064
				trace_i915_request_out(rq);
				i915_request_put(rq);
1065

1066 1067 1068
				GEM_TRACE("%s completed ctx=%d\n",
					  engine->name, port->context_id);

1069 1070 1071 1072 1073
				port = execlists_port_complete(execlists, port);
				if (port_isset(port))
					execlists_user_begin(execlists, port);
				else
					execlists_user_end(execlists);
1074 1075
			} else {
				port_set(port, port_pack(rq, count));
1076
			}
1077
		}
1078

1079 1080
		if (head != execlists->csb_head) {
			execlists->csb_head = head;
1081 1082 1083
			writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
			       dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
		}
1084 1085
	}

1086
	if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT))
1087
		execlists_dequeue(engine);
1088

1089 1090
	if (fw)
		intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
1091 1092 1093 1094 1095

	/* If the engine is now idle, so should be the flag; and vice versa. */
	GEM_BUG_ON(execlists_is_active(&engine->execlists,
				       EXECLISTS_ACTIVE_USER) ==
		   !port_isset(engine->execlists.port));
1096 1097
}

1098 1099 1100
static void queue_request(struct intel_engine_cs *engine,
			  struct i915_priotree *pt,
			  int prio)
1101
{
1102 1103
	list_add_tail(&pt->link, &lookup_priolist(engine, pt, prio)->requests);
}
1104

1105 1106 1107 1108 1109 1110
static void __submit_queue(struct intel_engine_cs *engine, int prio)
{
	engine->execlists.queue_priority = prio;
	tasklet_hi_schedule(&engine->execlists.tasklet);
}

1111 1112
static void submit_queue(struct intel_engine_cs *engine, int prio)
{
1113 1114
	if (prio > engine->execlists.queue_priority)
		__submit_queue(engine, prio);
1115 1116
}

1117
static void execlists_submit_request(struct i915_request *request)
1118
{
1119
	struct intel_engine_cs *engine = request->engine;
1120
	unsigned long flags;
1121

1122 1123
	/* Will be called from irq-context when using foreign fences. */
	spin_lock_irqsave(&engine->timeline->lock, flags);
1124

1125 1126
	queue_request(engine, &request->priotree, rq_prio(request));
	submit_queue(engine, rq_prio(request));
1127

1128
	GEM_BUG_ON(!engine->execlists.first);
1129 1130
	GEM_BUG_ON(list_empty(&request->priotree.link));

1131
	spin_unlock_irqrestore(&engine->timeline->lock, flags);
1132 1133
}

1134
static struct i915_request *pt_to_request(struct i915_priotree *pt)
1135
{
1136
	return container_of(pt, struct i915_request, priotree);
1137 1138
}

1139 1140 1141
static struct intel_engine_cs *
pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked)
{
1142
	struct intel_engine_cs *engine = pt_to_request(pt)->engine;
1143 1144

	GEM_BUG_ON(!locked);
1145 1146

	if (engine != locked) {
1147 1148
		spin_unlock(&locked->timeline->lock);
		spin_lock(&engine->timeline->lock);
1149 1150 1151 1152 1153
	}

	return engine;
}

1154
static void execlists_schedule(struct i915_request *request, int prio)
1155
{
1156
	struct intel_engine_cs *engine;
1157 1158 1159 1160
	struct i915_dependency *dep, *p;
	struct i915_dependency stack;
	LIST_HEAD(dfs);

1161 1162
	GEM_BUG_ON(prio == I915_PRIORITY_INVALID);

1163
	if (i915_request_completed(request))
1164 1165
		return;

1166 1167 1168
	if (prio <= READ_ONCE(request->priotree.priority))
		return;

1169 1170
	/* Need BKL in order to use the temporary link inside i915_dependency */
	lockdep_assert_held(&request->i915->drm.struct_mutex);
1171 1172 1173 1174

	stack.signaler = &request->priotree;
	list_add(&stack.dfs_link, &dfs);

1175 1176
	/*
	 * Recursively bump all dependent priorities to match the new request.
1177 1178 1179 1180 1181
	 *
	 * A naive approach would be to use recursion:
	 * static void update_priorities(struct i915_priotree *pt, prio) {
	 *	list_for_each_entry(dep, &pt->signalers_list, signal_link)
	 *		update_priorities(dep->signal, prio)
1182
	 *	queue_request(pt);
1183 1184 1185 1186 1187 1188 1189 1190 1191 1192
	 * }
	 * but that may have unlimited recursion depth and so runs a very
	 * real risk of overunning the kernel stack. Instead, we build
	 * a flat list of all dependencies starting with the current request.
	 * As we walk the list of dependencies, we add all of its dependencies
	 * to the end of the list (this may include an already visited
	 * request) and continue to walk onwards onto the new dependencies. The
	 * end result is a topological list of requests in reverse order, the
	 * last element in the list is the request we must execute first.
	 */
1193
	list_for_each_entry(dep, &dfs, dfs_link) {
1194 1195
		struct i915_priotree *pt = dep->signaler;

1196 1197
		/*
		 * Within an engine, there can be no cycle, but we may
1198 1199 1200 1201 1202
		 * refer to the same dependency chain multiple times
		 * (redundant dependencies are not eliminated) and across
		 * engines.
		 */
		list_for_each_entry(p, &pt->signalers_list, signal_link) {
1203 1204
			GEM_BUG_ON(p == dep); /* no cycles! */

1205
			if (i915_priotree_signaled(p->signaler))
1206 1207
				continue;

1208
			GEM_BUG_ON(p->signaler->priority < pt->priority);
1209 1210
			if (prio > READ_ONCE(p->signaler->priority))
				list_move_tail(&p->dfs_link, &dfs);
1211
		}
1212 1213
	}

1214 1215
	/*
	 * If we didn't need to bump any existing priorities, and we haven't
1216 1217 1218 1219
	 * yet submitted this request (i.e. there is no potential race with
	 * execlists_submit_request()), we can set our own priority and skip
	 * acquiring the engine locks.
	 */
1220
	if (request->priotree.priority == I915_PRIORITY_INVALID) {
1221 1222 1223 1224 1225 1226 1227
		GEM_BUG_ON(!list_empty(&request->priotree.link));
		request->priotree.priority = prio;
		if (stack.dfs_link.next == stack.dfs_link.prev)
			return;
		__list_del_entry(&stack.dfs_link);
	}

1228 1229 1230
	engine = request->engine;
	spin_lock_irq(&engine->timeline->lock);

1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242
	/* Fifo and depth-first replacement ensure our deps execute before us */
	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
		struct i915_priotree *pt = dep->signaler;

		INIT_LIST_HEAD(&dep->dfs_link);

		engine = pt_lock_engine(pt, engine);

		if (prio <= pt->priority)
			continue;

		pt->priority = prio;
1243 1244
		if (!list_empty(&pt->link)) {
			__list_del_entry(&pt->link);
1245
			queue_request(engine, pt, prio);
1246
		}
1247 1248 1249 1250

		if (prio > engine->execlists.queue_priority &&
		    i915_sw_fence_done(&pt_to_request(pt)->submit))
			__submit_queue(engine, prio);
1251 1252
	}

1253
	spin_unlock_irq(&engine->timeline->lock);
1254 1255
}

1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278
static int __context_pin(struct i915_gem_context *ctx, struct i915_vma *vma)
{
	unsigned int flags;
	int err;

	/*
	 * Clear this page out of any CPU caches for coherent swap-in/out.
	 * We only want to do this on the first bind so that we do not stall
	 * on an active context (which by nature is already on the GPU).
	 */
	if (!(vma->flags & I915_VMA_GLOBAL_BIND)) {
		err = i915_gem_object_set_to_gtt_domain(vma->obj, true);
		if (err)
			return err;
	}

	flags = PIN_GLOBAL | PIN_HIGH;
	if (ctx->ggtt_offset_bias)
		flags |= PIN_OFFSET_BIAS | ctx->ggtt_offset_bias;

	return i915_vma_pin(vma, 0, GEN8_LR_CONTEXT_ALIGN, flags);
}

1279 1280 1281
static struct intel_ring *
execlists_context_pin(struct intel_engine_cs *engine,
		      struct i915_gem_context *ctx)
1282
{
1283
	struct intel_context *ce = &ctx->engine[engine->id];
1284
	void *vaddr;
1285
	int ret;
1286

1287
	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
1288

1289 1290
	if (likely(ce->pin_count++))
		goto out;
1291
	GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
1292

1293 1294 1295
	ret = execlists_context_deferred_alloc(ctx, engine);
	if (ret)
		goto err;
1296
	GEM_BUG_ON(!ce->state);
1297

1298
	ret = __context_pin(ctx, ce->state);
1299
	if (ret)
1300
		goto err;
1301

1302
	vaddr = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
1303 1304
	if (IS_ERR(vaddr)) {
		ret = PTR_ERR(vaddr);
1305
		goto unpin_vma;
1306 1307
	}

1308
	ret = intel_ring_pin(ce->ring, ctx->i915, ctx->ggtt_offset_bias);
1309
	if (ret)
1310
		goto unpin_map;
1311

1312
	intel_lr_context_descriptor_update(ctx, engine);
1313

1314 1315
	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
1316
		i915_ggtt_offset(ce->ring->vma);
1317
	ce->lrc_reg_state[CTX_RING_HEAD+1] = ce->ring->head;
1318

1319
	ce->state->obj->pin_global++;
1320
	i915_gem_context_get(ctx);
1321 1322
out:
	return ce->ring;
1323

1324
unpin_map:
1325 1326 1327
	i915_gem_object_unpin_map(ce->state->obj);
unpin_vma:
	__i915_vma_unpin(ce->state);
1328
err:
1329
	ce->pin_count = 0;
1330
	return ERR_PTR(ret);
1331 1332
}

1333 1334
static void execlists_context_unpin(struct intel_engine_cs *engine,
				    struct i915_gem_context *ctx)
1335
{
1336
	struct intel_context *ce = &ctx->engine[engine->id];
1337

1338
	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
1339
	GEM_BUG_ON(ce->pin_count == 0);
1340

1341
	if (--ce->pin_count)
1342
		return;
1343

1344
	intel_ring_unpin(ce->ring);
1345

1346
	ce->state->obj->pin_global--;
1347 1348
	i915_gem_object_unpin_map(ce->state->obj);
	i915_vma_unpin(ce->state);
1349

1350
	i915_gem_context_put(ctx);
1351 1352
}

1353
static int execlists_request_alloc(struct i915_request *request)
1354 1355 1356
{
	struct intel_engine_cs *engine = request->engine;
	struct intel_context *ce = &request->ctx->engine[engine->id];
1357
	int ret;
1358

1359 1360
	GEM_BUG_ON(!ce->pin_count);

1361 1362 1363 1364 1365 1366
	/* Flush enough space to reduce the likelihood of waiting after
	 * we start building the request - in which case we will just
	 * have to repeat work.
	 */
	request->reserved_space += EXECLISTS_REQUEST_SIZE;

1367 1368 1369
	ret = intel_ring_wait_for_space(request->ring, request->reserved_space);
	if (ret)
		return ret;
1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381

	/* Note that after this point, we have committed to using
	 * this request as it is being used to both track the
	 * state of engine initialisation and liveness of the
	 * golden renderstate above. Think twice before you try
	 * to cancel/unwind this request now.
	 */

	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
	return 0;
}

1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397
/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
 * but there is a slight complication as this is applied in WA batch where the
 * values are only initialized once so we cannot take register value at the
 * beginning and reuse it further; hence we save its value to memory, upload a
 * constant value with bit21 set and then we restore it back with the saved value.
 * To simplify the WA, a constant value is formed by using the default value
 * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch in non-premptible. We can ofcourse
 * use additional instructions that read the actual value of the register
 * at that time and set our bit of interest but it makes the WA complicated.
 *
 * This WA is also required for Gen9 so extracting as a function avoids
 * code duplication.
 */
1398 1399
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1400
{
1401 1402 1403 1404 1405 1406 1407 1408 1409
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = i915_ggtt_offset(engine->scratch) + 256;
	*batch++ = 0;

	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

1410 1411 1412 1413
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);
1414 1415 1416 1417 1418 1419 1420

	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = i915_ggtt_offset(engine->scratch) + 256;
	*batch++ = 0;

	return batch;
1421 1422
}

1423 1424 1425 1426 1427 1428
/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts but this field
 * helps us to have multiple batches at different offsets and select them based
 * on a criteria. At the moment this batch always start at the beginning of the page
 * and at this point we don't have multiple wa_ctx batch buffers.
1429
 *
1430 1431
 * The number of WA applied are not known at the beginning; we use this field
 * to return the no of DWORDS written.
1432
 *
1433 1434 1435 1436
 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
 * makes a complete batch buffer.
1437
 */
1438
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1439
{
1440
	/* WaDisableCtxRestoreArbitration:bdw,chv */
1441
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1442

1443
	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1444 1445
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1446

1447 1448
	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
1449 1450 1451 1452 1453 1454 1455
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_GLOBAL_GTT_IVB |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       i915_ggtt_offset(engine->scratch) +
				       2 * CACHELINE_BYTES);
1456

C
Chris Wilson 已提交
1457 1458
	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

1459
	/* Pad to end of cacheline */
1460 1461
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;
1462 1463 1464 1465 1466 1467 1468

	/*
	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

1469
	return batch;
1470 1471
}

static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(COMMON_SLICE_CHICKEN2);
	*batch++ = _MASKED_BIT_DISABLE(
			GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE);
	*batch++ = MI_NOOP;

	/* WaClearSlmSpaceAtContextSwitch:kbl */
	/* Actual scratch location is at 128 bytes offset */
	if (IS_KBL_REVID(engine->i915, 0, KBL_REVID_A0)) {
		batch = gen8_emit_pipe_control(batch,
					       PIPE_CONTROL_FLUSH_L3 |
					       PIPE_CONTROL_GLOBAL_GTT_IVB |
					       PIPE_CONTROL_CS_STALL |
					       PIPE_CONTROL_QW_WRITE,
					       i915_ggtt_offset(engine->scratch)
					       + 2 * CACHELINE_BYTES);
	}

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is set up along with the golden
		 * context during context initialization. This value depends
		 * on the device type (2x6 or 3x6) and needs to be updated
		 * based on which subslice is disabled, especially for 2x6
		 * devices. However, it is safe to load the default 3x6
		 * configuration instead of masking off the corresponding
		 * bits, because HW ignores bits of a disabled subslice and
		 * drops down to the appropriate config. Please see
		 * render_state_setup() in i915_gem_render_state.c for
		 * possible configurations; to avoid duplication they are
		 * not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

static u32 *
gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	int i;

	/*
	 * WaPipeControlBefore3DStateSamplePattern: cnl
	 *
	 * Ensure the engine is idle prior to programming a
	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
	 */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL,
				       0);
	/*
	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
	 * confusing. Since gen8_emit_pipe_control() already advances the
	 * batch by 6 dwords, we advance the other 10 here, completing a
	 * cacheline. It's not clear if the workaround requires this padding
	 * before other commands, or if it's just the regular padding we would
	 * already have for the workaround bb, so leave it here for now.
	 */
	for (i = 0; i < 10; i++)
		*batch++ = MI_NOOP;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)

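/*
 * Allocate a single page for the per-engine workaround batch buffers
 * (indirect_ctx and per_ctx) and pin it high in the global GTT; the
 * resulting vma is stashed in engine->wa_ctx for later use by
 * intel_init_workaround_bb() and the context register setup.
 */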
static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	int err;

	obj = i915_gem_object_create(engine->i915, CTX_WA_BB_OBJ_SIZE);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	vma = i915_vma_instance(obj, &engine->i915->ggtt.base, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	err = i915_vma_pin(vma, 0, PAGE_SIZE, PIN_GLOBAL | PIN_HIGH);
	if (err)
		goto err;

	engine->wa_ctx.vma = vma;
	return 0;

err:
	i915_gem_object_put(obj);
	return err;
}

static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma);
}

typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);

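/*
 * Build the gen-specific workaround batch buffers into the single WA page:
 * each emitter above fills in its commands, and the resulting offset/size
 * pairs are recorded in engine->wa_ctx so that execlists_init_reg_state()
 * can point INDIRECT_CTX and BB_PER_CTX_PTR at them.
 */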
static int intel_init_workaround_bb(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
					    &wa_ctx->per_ctx };
	wa_bb_func_t wa_bb_fn[2];
	struct page *page;
	void *batch, *batch_ptr;
	unsigned int i;
	int ret;

	if (GEM_WARN_ON(engine->id != RCS))
		return -EINVAL;

	switch (INTEL_GEN(engine->i915)) {
	case 10:
		wa_bb_fn[0] = gen10_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	case 9:
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	case 8:
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	default:
		MISSING_CASE(INTEL_GEN(engine->i915));
		return 0;
	}

	ret = lrc_setup_wa_ctx(engine);
	if (ret) {
		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
		return ret;
	}

	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
	batch = batch_ptr = kmap_atomic(page);

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		if (GEM_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
					    CACHELINE_BYTES))) {
			ret = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}

	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);

	kunmap_atomic(batch);
	if (ret)
		lrc_destroy_wa_ctx(engine);

	return ret;
}

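/*
 * Switch the engine into execlist submission mode: mask everything via
 * HWSTAM, select run-list (execlist) mode in RING_MODE, reload the HWSP
 * address and mark the CSB head as stale so it is re-read after reset.
 */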
static void enable_execlists(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE(RING_HWSTAM(engine->mmio_base), 0xffffffff);

	/*
	 * Make sure we're not enabling the new 12-deep CSB
	 * FIFO as that requires a slightly updated handling
	 * in the ctx switch irq. Since we're currently only
	 * using 2 elements of the enhanced execlists, the
	 * deeper FIFO isn't needed and isn't worth adding
	 * more statements to the irq handler to support it.
	 */
	if (INTEL_GEN(dev_priv) >= 11)
		I915_WRITE(RING_MODE_GEN7(engine),
			   _MASKED_BIT_DISABLE(GEN11_GFX_DISABLE_LEGACY_MODE));
	else
		I915_WRITE(RING_MODE_GEN7(engine),
			   _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));

	I915_WRITE(RING_HWS_PGA(engine->mmio_base),
		   engine->status_page.ggtt_offset);
	POSTING_READ(RING_HWS_PGA(engine->mmio_base));

	/* Following the reset, we need to reload the CSB read/write pointers */
	engine->execlists.csb_head = -1;
}

static int gen8_init_common_ring(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	int ret;

	ret = intel_mocs_init_engine(engine);
	if (ret)
		return ret;

	intel_engine_reset_breadcrumbs(engine);
	intel_engine_init_hangcheck(engine);

	enable_execlists(engine);

	/* After a GPU reset, we may have requests to replay */
	if (execlists->first)
		tasklet_schedule(&execlists->tasklet);

	return 0;
}

static int gen8_init_render_ring(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	ret = gen8_init_common_ring(engine);
	if (ret)
		return ret;

	/* We need to disable the AsyncFlip performance optimisations in order
	 * to use MI_WAIT_FOR_EVENT within the CS. It should already be
	 * programmed to '1' on all products.
	 *
	 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv,bdw,chv
	 */
	I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));

	I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));

	return init_workarounds_ring(engine);
}

static int gen9_init_render_ring(struct intel_engine_cs *engine)
{
	int ret;

	ret = gen8_init_common_ring(engine);
	if (ret)
		return ret;

	return init_workarounds_ring(engine);
}

static void reset_common_ring(struct intel_engine_cs *engine,
			      struct i915_request *request)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct intel_context *ce;
	unsigned long flags;

	GEM_TRACE("%s request global=%x, current=%d\n",
		  engine->name, request ? request->global_seqno : 0,
		  intel_engine_get_seqno(engine));

	/* See execlists_cancel_requests() for the irq/spinlock split. */
	local_irq_save(flags);

	/*
	 * Catch up with any missed context-switch interrupts.
	 *
	 * Ideally we would just read the remaining CSB entries now that we
	 * know the gpu is idle. However, the CSB registers are sometimes^W
	 * often trashed across a GPU reset! Instead we have to rely on
	 * guessing the missed context-switch events by looking at what
	 * requests were completed.
	 */
	execlists_cancel_port_requests(execlists);
	reset_irq(engine);

	/* Push back any incomplete requests for replay after the reset. */
	spin_lock(&engine->timeline->lock);
	__unwind_incomplete_requests(engine);
	spin_unlock(&engine->timeline->lock);

	local_irq_restore(flags);

	/*
	 * If the request was innocent, we leave the request in the ELSP
	 * and will try to replay it on restarting. The context image may
	 * have been corrupted by the reset, in which case we may have
	 * to service a new GPU hang, but more likely we can continue on
	 * without impact.
	 *
	 * If the request was guilty, we presume the context is corrupt
	 * and have to at least restore the RING register in the context
	 * image back to the expected values to skip over the guilty request.
	 */
	if (!request || request->fence.error != -EIO)
		return;

	/*
	 * We want a simple context + ring to execute the breadcrumb update.
	 * We cannot rely on the context being intact across the GPU hang,
	 * so clear it and rebuild just what we need for the breadcrumb.
	 * All pending requests for this context will be zapped, and any
	 * future request will be after userspace has had the opportunity
	 * to recreate its own state.
	 */
	ce = &request->ctx->engine[engine->id];
	execlists_init_reg_state(ce->lrc_reg_state,
				 request->ctx, engine, ce->ring);

	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
	ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
		i915_ggtt_offset(ce->ring->vma);
	ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;

	request->ring->head = request->postfix;
	intel_ring_update_space(request->ring);

	/* Reset WaIdleLiteRestore:bdw,skl as well */
	unwind_wa_tail(request);
}

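/*
 * Emit a single MI_LOAD_REGISTER_IMM that rewrites all four PDP register
 * pairs, so a 32b (3-level) ppgtt picks up freshly allocated page
 * directories before the batch runs.
 */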
static int intel_logical_ring_emit_pdps(struct i915_request *rq)
{
	struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
	struct intel_engine_cs *engine = rq->engine;
	const int num_lri_cmds = GEN8_3LVL_PDPES * 2;
	u32 *cs;
	int i;

	cs = intel_ring_begin(rq, num_lri_cmds * 2 + 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_LOAD_REGISTER_IMM(num_lri_cmds);
	for (i = GEN8_3LVL_PDPES - 1; i >= 0; i--) {
		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);

		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
		*cs++ = upper_32_bits(pd_daddr);
		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
		*cs++ = lower_32_bits(pd_daddr);
	}

	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}

static int gen8_emit_bb_start(struct i915_request *rq,
			      u64 offset, u32 len,
			      const unsigned int flags)
{
	u32 *cs;
	int ret;

	/* Don't rely on the hw updating the PDPs, especially in lite-restore.
	 * Ideally, we should set Force PD Restore in ctx descriptor,
	 * but we can't. Force Restore would be a second option, but
	 * it is unsafe in case of lite-restore (because the ctx is
	 * not idle). PML4 is allocated during ppgtt init so this is
	 * not needed in 48-bit.
	 */
	if (rq->ctx->ppgtt &&
	    (intel_engine_flag(rq->engine) & rq->ctx->ppgtt->pd_dirty_rings) &&
	    !i915_vm_is_48bit(&rq->ctx->ppgtt->base) &&
	    !intel_vgpu_active(rq->i915)) {
		ret = intel_logical_ring_emit_pdps(rq);
		if (ret)
			return ret;

		rq->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/*
	 * WaDisableCtxRestoreArbitration:bdw,chv
	 *
	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
	 * particular all the gen that do not need the w/a at all!), if we
	 * took care to make sure that on every switch into this context
	 * (both ordinary and for preemption) that arbitration was enabled
	 * we would be fine. However, there doesn't seem to be a downside to
	 * being paranoid and making sure it is set before each batch and
	 * every context-switch.
	 *
	 * Note that if we fail to enable arbitration before the request
	 * is complete, then we do not see the context-switch interrupt and
	 * the engine hangs (with RING_HEAD == RING_TAIL).
	 *
	 * That satisfies both the GPGPU w/a and our heavy-handed paranoia.
	 */
	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* FIXME(BDW): Address space and security selectors. */
	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8)) |
		(flags & I915_DISPATCH_RS ? MI_BATCH_RESOURCE_STREAMER : 0);
	*cs++ = lower_32_bits(offset);
	*cs++ = upper_32_bits(offset);
	intel_ring_advance(rq, cs);

	return 0;
}

static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_IMR(engine,
		       ~(engine->irq_enable_mask | engine->irq_keep_mask));
	POSTING_READ_FW(RING_IMR(engine->mmio_base));
}

static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	I915_WRITE_IMR(engine, ~engine->irq_keep_mask);
}

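/*
 * Minimal flush for the non-render engines: a single MI_FLUSH_DW posting a
 * dummy write to the HWSP scratch slot as a command barrier, optionally
 * invalidating the TLBs (and the BSD caches on the video engine).
 */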
static int gen8_emit_flush(struct i915_request *request, u32 mode)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(request, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW + 1;

	/* We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_INVALIDATE_TLB;
		if (request->engine->id == VCS)
			cmd |= MI_INVALIDATE_BSD;
	}

	*cs++ = cmd;
	*cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = 0; /* upper addr */
	*cs++ = 0; /* value */
	intel_ring_advance(request, cs);

	return 0;
}

static int gen8_emit_flush_render(struct i915_request *request,
				  u32 mode)
{
	struct intel_engine_cs *engine = request->engine;
	u32 scratch_addr =
		i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES;
	bool vf_flush_wa = false, dc_flush_wa = false;
	u32 *cs, flags = 0;
	int len;

	flags |= PIPE_CONTROL_CS_STALL;

	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}

	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

		/*
		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
		 * pipe control.
		 */
		if (IS_GEN9(request->i915))
			vf_flush_wa = true;

		/* WaForGAMHang:kbl */
		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
			dc_flush_wa = true;
	}

	len = 6;

	if (vf_flush_wa)
		len += 6;

	if (dc_flush_wa)
		len += 12;

	cs = intel_ring_begin(request, len);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (vf_flush_wa)
		cs = gen8_emit_pipe_control(cs, 0, 0);

	if (dc_flush_wa)
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
					    0);

	cs = gen8_emit_pipe_control(cs, flags, scratch_addr);

	if (dc_flush_wa)
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);

	intel_ring_advance(request, cs);

	return 0;
}

/*
 * Reserve space for 2 NOOPs at the end of each request to be
 * used as a workaround for not being allowed to do lite
 * restore with HEAD==TAIL (WaIdleLiteRestore).
 */
static void gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
{
	/* Ensure there's always at least one preemption point per-request. */
	*cs++ = MI_ARB_CHECK;
	*cs++ = MI_NOOP;
	request->wa_tail = intel_ring_offset(request, cs);
}

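/*
 * Default breadcrumb: write the global seqno to the hardware status page
 * via a GGTT write, raise MI_USER_INTERRUPT and then append the
 * WaIdleLiteRestore tail padding above. The render engine overrides this
 * with the _rcs variant below.
 */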
static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
{
	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));

	cs = gen8_emit_ggtt_write(cs, request->global_seqno,
				  intel_hws_seqno_address(request->engine));
	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;
	request->tail = intel_ring_offset(request, cs);
	assert_ring_tail_valid(request->ring, request->tail);

	gen8_emit_wa_tail(request, cs);
}
static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;

static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
{
	/* We're using qword write, seqno should be aligned to 8 bytes. */
	BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);

	cs = gen8_emit_ggtt_write_rcs(cs, request->global_seqno,
				      intel_hws_seqno_address(request->engine));
	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;
	request->tail = intel_ring_offset(request, cs);
	assert_ring_tail_valid(request->ring, request->tail);

	gen8_emit_wa_tail(request, cs);
}
static const int gen8_emit_breadcrumb_rcs_sz = 8 + WA_TAIL_DWORDS;

static int gen8_init_rcs_context(struct i915_request *rq)
{
	int ret;

	ret = intel_ring_workarounds_emit(rq);
	if (ret)
		return ret;

	ret = intel_rcs_context_init_mocs(rq);
	/*
	 * Failing to program the MOCS is non-fatal. The system will not
	 * run at peak performance. So generate an error and carry on.
	 */
	if (ret)
		DRM_ERROR("MOCS failed to program: expect performance issues.\n");

	return i915_gem_render_state_emit(rq);
}

/**
 * intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
 * @engine: Engine Command Streamer.
 */
void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv;

	/*
	 * Tasklet cannot be active at this point due to intel_mark_active/idle
	 * so this is just for documentation.
	 */
	if (WARN_ON(test_bit(TASKLET_STATE_SCHED,
			     &engine->execlists.tasklet.state)))
		tasklet_kill(&engine->execlists.tasklet);

	dev_priv = engine->i915;

	if (engine->buffer) {
		WARN_ON((I915_READ_MODE(engine) & MODE_IDLE) == 0);
	}

	if (engine->cleanup)
		engine->cleanup(engine);

	intel_engine_cleanup_common(engine);

	lrc_destroy_wa_ctx(engine);

	engine->i915 = NULL;
	dev_priv->engine[engine->id] = NULL;
	kfree(engine);
}

static void execlists_set_default_submission(struct intel_engine_cs *engine)
{
	engine->submit_request = execlists_submit_request;
	engine->cancel_requests = execlists_cancel_requests;
	engine->schedule = execlists_schedule;
	engine->execlists.tasklet.func = execlists_submission_tasklet;

	engine->park = NULL;
	engine->unpark = NULL;

	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
	if (engine->i915->preempt_context)
		engine->flags |= I915_ENGINE_HAS_PREEMPTION;

	engine->i915->caps.scheduler =
		I915_SCHEDULER_CAP_ENABLED |
		I915_SCHEDULER_CAP_PRIORITY;
	if (intel_engine_has_preemption(engine))
		engine->i915->caps.scheduler |= I915_SCHEDULER_CAP_PREEMPTION;
}

static void
logical_ring_default_vfuncs(struct intel_engine_cs *engine)
{
	/* Default vfuncs which can be overridden by each engine. */
	engine->init_hw = gen8_init_common_ring;
	engine->reset_hw = reset_common_ring;

	engine->context_pin = execlists_context_pin;
	engine->context_unpin = execlists_context_unpin;

	engine->request_alloc = execlists_request_alloc;

	engine->emit_flush = gen8_emit_flush;
	engine->emit_breadcrumb = gen8_emit_breadcrumb;
	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;

	engine->set_default_submission = execlists_set_default_submission;

	if (INTEL_GEN(engine->i915) < 11) {
		engine->irq_enable = gen8_logical_ring_enable_irq;
		engine->irq_disable = gen8_logical_ring_disable_irq;
	} else {
		/*
		 * TODO: On Gen11 interrupt masks need to be clear
		 * to allow C6 entry. Keep interrupts enabled at all
		 * times and take the hit of generating extra interrupts
		 * until a more refined solution exists.
		 */
	}
	engine->emit_bb_start = gen8_emit_bb_start;
}

static inline void
logical_ring_default_irqs(struct intel_engine_cs *engine)
{
	unsigned int shift = 0;

	if (INTEL_GEN(engine->i915) < 11) {
		const u8 irq_shifts[] = {
			[RCS]  = GEN8_RCS_IRQ_SHIFT,
			[BCS]  = GEN8_BCS_IRQ_SHIFT,
			[VCS]  = GEN8_VCS1_IRQ_SHIFT,
			[VCS2] = GEN8_VCS2_IRQ_SHIFT,
			[VECS] = GEN8_VECS_IRQ_SHIFT,
		};

		shift = irq_shifts[engine->id];
	}

	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
}

static void
logical_ring_setup(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	enum forcewake_domains fw_domains;

	intel_engine_setup_common(engine);

	/* Intentionally left blank. */
	engine->buffer = NULL;

	fw_domains = intel_uncore_forcewake_for_reg(dev_priv,
						    RING_ELSP(engine),
						    FW_REG_WRITE);

	fw_domains |= intel_uncore_forcewake_for_reg(dev_priv,
						     RING_CONTEXT_STATUS_PTR(engine),
						     FW_REG_READ | FW_REG_WRITE);

	fw_domains |= intel_uncore_forcewake_for_reg(dev_priv,
						     RING_CONTEXT_STATUS_BUF_BASE(engine),
						     FW_REG_READ);

	engine->execlists.fw_domains = fw_domains;

	tasklet_init(&engine->execlists.tasklet,
		     execlists_submission_tasklet, (unsigned long)engine);

	logical_ring_default_vfuncs(engine);
	logical_ring_default_irqs(engine);
}

static int logical_ring_init(struct intel_engine_cs *engine)
{
	int ret;

	ret = intel_engine_init_common(engine);
	if (ret)
		goto error;

	if (HAS_LOGICAL_RING_ELSQ(engine->i915)) {
		engine->execlists.submit_reg = engine->i915->regs +
			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(engine));
		engine->execlists.ctrl_reg = engine->i915->regs +
			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(engine));
	} else {
		engine->execlists.submit_reg = engine->i915->regs +
			i915_mmio_reg_offset(RING_ELSP(engine));
	}

	engine->execlists.preempt_complete_status = ~0u;
	if (engine->i915->preempt_context)
		engine->execlists.preempt_complete_status =
			upper_32_bits(engine->i915->preempt_context->engine[engine->id].lrc_desc);

	return 0;

error:
	intel_logical_ring_cleanup(engine);
	return ret;
}

int logical_render_ring_init(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	int ret;

	logical_ring_setup(engine);

	if (HAS_L3_DPF(dev_priv))
		engine->irq_keep_mask |= GT_RENDER_L3_PARITY_ERROR_INTERRUPT;

	/* Override some for render ring. */
	if (INTEL_GEN(dev_priv) >= 9)
		engine->init_hw = gen9_init_render_ring;
	else
		engine->init_hw = gen8_init_render_ring;
	engine->init_context = gen8_init_rcs_context;
	engine->emit_flush = gen8_emit_flush_render;
	engine->emit_breadcrumb = gen8_emit_breadcrumb_rcs;
	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_rcs_sz;

	ret = intel_engine_create_scratch(engine, PAGE_SIZE);
	if (ret)
		return ret;

	ret = intel_init_workaround_bb(engine);
	if (ret) {
		/*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches but nothing
		 * critical to prevent us from using the GPU.
		 */
		DRM_ERROR("WA batch buffer initialization failed: %d\n",
			  ret);
	}

	return logical_ring_init(engine);
}

int logical_xcs_ring_init(struct intel_engine_cs *engine)
{
	logical_ring_setup(engine);

	return logical_ring_init(engine);
}

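/*
 * Compute the default R_PWR_CLK_STATE (RPCS) value, explicitly requesting
 * the full slice/subslice/EU counts reported in the sseu info whenever the
 * corresponding power gating is present.
 */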
static u32
make_rpcs(struct drm_i915_private *dev_priv)
{
	u32 rpcs = 0;

	/*
	 * No explicit RPCS request is needed to ensure full
	 * slice/subslice/EU enablement prior to Gen9.
	 */
	if (INTEL_GEN(dev_priv) < 9)
		return 0;

	/*
	 * Starting in Gen9, render power gating can leave
	 * slice/subslice/EU in a partially enabled state. We
	 * must make an explicit request through RPCS for full
	 * enablement.
	 */
	if (INTEL_INFO(dev_priv)->sseu.has_slice_pg) {
		rpcs |= GEN8_RPCS_S_CNT_ENABLE;
		rpcs |= hweight8(INTEL_INFO(dev_priv)->sseu.slice_mask) <<
			GEN8_RPCS_S_CNT_SHIFT;
		rpcs |= GEN8_RPCS_ENABLE;
	}

	if (INTEL_INFO(dev_priv)->sseu.has_subslice_pg) {
		rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
		rpcs |= hweight8(INTEL_INFO(dev_priv)->sseu.subslice_mask[0]) <<
			GEN8_RPCS_SS_CNT_SHIFT;
		rpcs |= GEN8_RPCS_ENABLE;
	}

	if (INTEL_INFO(dev_priv)->sseu.has_eu_pg) {
		rpcs |= INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
			GEN8_RPCS_EU_MIN_SHIFT;
		rpcs |= INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
			GEN8_RPCS_EU_MAX_SHIFT;
		rpcs |= GEN8_RPCS_ENABLE;
	}

	return rpcs;
}

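/*
 * Per-gen default for the CTX_RCS_INDIRECT_CTX_OFFSET field; the value is
 * shifted into place by the caller when writing the context image.
 */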
static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
{
	u32 indirect_ctx_offset;

	switch (INTEL_GEN(engine->i915)) {
	default:
		MISSING_CASE(INTEL_GEN(engine->i915));
		/* fall through */
	case 11:
		indirect_ctx_offset =
			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	case 10:
		indirect_ctx_offset =
			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	case 9:
		indirect_ctx_offset =
			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	case 8:
		indirect_ctx_offset =
			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
		break;
	}

	return indirect_ctx_offset;
}

static void execlists_init_reg_state(u32 *regs,
				     struct i915_gem_context *ctx,
				     struct intel_engine_cs *engine,
				     struct intel_ring *ring)
{
	struct drm_i915_private *dev_priv = engine->i915;
	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: dev_priv->mm.aliasing_ppgtt;
	u32 base = engine->mmio_base;
	bool rcs = engine->id == RCS;

	/* A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 */
	regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
				 MI_LRI_FORCE_POSTED;

	CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(engine),
		_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
				    CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT) |
		_MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
				   (HAS_RESOURCE_STREAMER(dev_priv) ?
				   CTX_CTRL_RS_CTX_ENABLE : 0)));
	CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0);
	CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0);
	CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0);
	CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base),
		RING_CTL_SIZE(ring->size) | RING_VALID);
	CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0);
	CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0);
	CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT);
	CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
	CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0);
	CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0);
	if (rcs) {
		struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;

		CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0);
		CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET,
			RING_INDIRECT_CTX_OFFSET(base), 0);
		if (wa_ctx->indirect_ctx.size) {
			u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

			regs[CTX_RCS_INDIRECT_CTX + 1] =
				(ggtt_offset + wa_ctx->indirect_ctx.offset) |
				(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);

			regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] =
				intel_lr_indirect_ctx_offset(engine) << 6;
		}

		CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0);
		if (wa_ctx->per_ctx.size) {
			u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

			regs[CTX_BB_PER_CTX_PTR + 1] =
				(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
		}
	}

	regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;

	CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
	/* PDP values will be assigned later if needed */
	CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(engine, 3), 0);
	CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(engine, 3), 0);
	CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(engine, 2), 0);
	CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(engine, 2), 0);
	CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(engine, 1), 0);
	CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(engine, 1), 0);
	CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(engine, 0), 0);
	CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(engine, 0), 0);

	if (ppgtt && i915_vm_is_48bit(&ppgtt->base)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	}

	if (rcs) {
		regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
		CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
			make_rpcs(dev_priv));

		i915_oa_init_reg_state(engine, ctx, regs);
	}
}

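/*
 * Fill in a freshly allocated context image: copy the engine's default
 * (golden) state if we have one, then overwrite the register state page
 * using execlists_init_reg_state(). Contexts without a default state are
 * marked restore-inhibit instead, so the HW never restores an
 * uninitialised image.
 */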
static int
populate_lr_context(struct i915_gem_context *ctx,
		    struct drm_i915_gem_object *ctx_obj,
		    struct intel_engine_cs *engine,
		    struct intel_ring *ring)
{
	void *vaddr;
	u32 *regs;
	int ret;

	ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
	if (ret) {
		DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
		return ret;
	}

	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		ret = PTR_ERR(vaddr);
		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
		return ret;
	}
	ctx_obj->mm.dirty = true;

	if (engine->default_state) {
		/*
		 * We only want to copy over the template context state;
		 * skipping over the headers reserved for GuC communication,
		 * leaving those as zero.
		 */
		const unsigned long start = LRC_HEADER_PAGES * PAGE_SIZE;
		void *defaults;

		defaults = i915_gem_object_pin_map(engine->default_state,
						   I915_MAP_WB);
		if (IS_ERR(defaults))
			return PTR_ERR(defaults);

		memcpy(vaddr + start, defaults + start, engine->context_size);
		i915_gem_object_unpin_map(engine->default_state);
	}

	/* The second page of the context object contains some fields which must
	 * be set up prior to the first execution. */
	regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
	execlists_init_reg_state(regs, ctx, engine, ring);
	if (!engine->default_state)
		regs[CTX_CONTEXT_CONTROL + 1] |=
			_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (ctx == ctx->i915->preempt_context && INTEL_GEN(engine->i915) < 11)
		regs[CTX_CONTEXT_CONTROL + 1] |=
			_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
					   CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT);

	i915_gem_object_unpin_map(ctx_obj);

	return 0;
}

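/*
 * First use of a context on this engine: allocate the backing object
 * (including the GuC header pages), a GGTT vma for it and the ring buffer,
 * then populate the context image. Everything is unwound again on failure.
 */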
static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
					    struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *ctx_obj;
	struct intel_context *ce = &ctx->engine[engine->id];
	struct i915_vma *vma;
	uint32_t context_size;
	struct intel_ring *ring;
	int ret;

	if (ce->state)
		return 0;

	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	/*
	 * Before the actual start of the context image, we insert a few pages
	 * for our own use and for sharing with the GuC.
	 */
	context_size += LRC_HEADER_PAGES * PAGE_SIZE;

	ctx_obj = i915_gem_object_create(ctx->i915, context_size);
	if (IS_ERR(ctx_obj)) {
		DRM_DEBUG_DRIVER("Alloc LRC backing obj failed.\n");
		return PTR_ERR(ctx_obj);
	}

	vma = i915_vma_instance(ctx_obj, &ctx->i915->ggtt.base, NULL);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto error_deref_obj;
	}

	ring = intel_engine_create_ring(engine, ctx->ring_size);
	if (IS_ERR(ring)) {
		ret = PTR_ERR(ring);
		goto error_deref_obj;
	}

	ret = populate_lr_context(ctx, ctx_obj, engine, ring);
	if (ret) {
		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
		goto error_ring_free;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

error_ring_free:
	intel_ring_free(ring);
error_deref_obj:
	i915_gem_object_put(ctx_obj);
	return ret;
}

void intel_lr_context_resume(struct drm_i915_private *dev_priv)
{
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	enum intel_engine_id id;

	/* Because we emit WA_TAIL_DWORDS there may be a disparity
	 * between our bookkeeping in ce->ring->head and ce->ring->tail and
	 * that stored in context. As we only write new commands from
	 * ce->ring->tail onwards, everything before that is junk. If the GPU
	 * starts reading from its RING_HEAD from the context, it may try to
	 * execute that junk and die.
	 *
	 * So to avoid that we reset the context images upon resume. For
	 * simplicity, we just zero everything out.
	 */
	list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
		for_each_engine(engine, dev_priv, id) {
			struct intel_context *ce = &ctx->engine[engine->id];
			u32 *reg;

			if (!ce->state)
				continue;

			reg = i915_gem_object_pin_map(ce->state->obj,
						      I915_MAP_WB);
			if (WARN_ON(IS_ERR(reg)))
				continue;

			reg += LRC_STATE_PN * PAGE_SIZE / sizeof(*reg);
			reg[CTX_RING_HEAD+1] = 0;
			reg[CTX_RING_TAIL+1] = 0;

			ce->state->obj->mm.dirty = true;
			i915_gem_object_unpin_map(ce->state->obj);

			intel_ring_reset(ce->ring, 0);
		}
	}
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftests/intel_lrc.c"
#endif