drm/i915: Restore context and pd for ringbuffer submission after reset

Following a reset, the context and page directory registers are lost. However, the queue of requests that we resubmit after the reset may depend upon them - the registers are restored from a context image, but that restore may be inhibited and may simply be absent from the request if it was in the middle of a sequence using the same context. If we prime the CCID/PD registers with the first request in the queue (even for the hung request), we prevent invalid memory access for the following requests (and continually hung engines). v2: Magic BIT(8), reserved for future use but still appears unused. v3: Some commentary on handling innocent vs guilty requests v4: Add a wait for PD_BASE fetch. The reload appears to be instant on my Ivybridge, but this bit probably exists for a reason. Fixes: 821ed7df ("drm/i915: Update reset path to fix incomplete requests") Signed-off-by: N Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Mika Kuoppala <mika.kuoppala@intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170207152437.4252-1-chris@chris-wilson.co.ukReviewed-by: N Mika Kuoppala <mika.kuoppala@intel.com>

drm/i915: Restore context and pd for ringbuffer submission after reset
Following a reset, the context and page directory registers are lost. However, the queue of requests that we resubmit after the reset may depend upon them - the registers are restored from a context image, but that restore may be inhibited and may simply be absent from the request if it was in the middle of a sequence using the same context. If we prime the CCID/PD registers with the first request in the queue (even for the hung request), we prevent invalid memory access for the following requests (and continually hung engines). v2: Magic BIT(8), reserved for future use but still appears unused. v3: Some commentary on handling innocent vs guilty requests v4: Add a wait for PD_BASE fetch. The reload appears to be instant on my Ivybridge, but this bit probably exists for a reason. Fixes: 821ed7df ("drm/i915: Update reset path to fix incomplete requests") Signed-off-by: N Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Mika Kuoppala <mika.kuoppala@intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170207152437.4252-1-chris@chris-wilson.co.ukReviewed-by: N Mika Kuoppala <mika.kuoppala@intel.com>
c0dcb203 · Chris Wilson · 6248017a · c0dcb203 · c0dcb203 · c0dcb203
4 changed file
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2745,21 +2745,17 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
 		engine->irq_seqno_barrier(engine);
 	request = i915_gem_find_active_request(engine);
-	if (!request)
+	if (request && i915_gem_reset_request(request)) {
-		return;
-	if (!i915_gem_reset_request(request))
-		return;
 		DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
 				 engine->name, request->global_seqno);
-	/* Setup the CS to resume from the breadcrumb of the hung request */
+		/* If this context is now banned, skip all pending requests. */
-	engine->reset_hw(engine, request);
-	/* If this context is now banned, skip all of its pending requests. */
 		if (i915_gem_context_is_banned(request->ctx))
 			engine_skip_context(request);
+	}
+	/* Setup the CS to resume from the breadcrumb of the hung request */
+	engine->reset_hw(engine, request);
 }
 void i915_gem_reset_finish(struct drm_i915_private *dev_priv)

--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -3308,7 +3308,9 @@ enum skl_disp_power_wells {
 * Logical Context regs
 */
 #define CCID				_MMIO(0x2180)
-#define   CCID_EN		(1<<0)
+#define   CCID_EN			BIT(0)
+#define   CCID_EXTENDED_STATE_RESTORE	BIT(2)
+#define   CCID_EXTENDED_STATE_SAVE	BIT(3)
 /*
 * Notes on SNB/IVB/VLV context size:
 * - Power context is saved elsewhere (LLC or stolen)

--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1352,7 +1352,20 @@ static void reset_common_ring(struct intel_engine_cs *engine,
 			      struct drm_i915_gem_request *request)
 {
 	struct execlist_port *port = engine->execlist_port;
-	struct intel_context *ce = &request->ctx->engine[engine->id];
+	struct intel_context *ce;
+	/* If the request was innocent, we leave the request in the ELSP
+	 * and will try to replay it on restarting. The context image may
+	 * have been corrupted by the reset, in which case we may have
+	 * to service a new GPU hang, but more likely we can continue on
+	 * without impact.
+	 *
+	 * If the request was guilty, we presume the context is corrupt
+	 * and have to at least restore the RING register in the context
+	 * image back to the expected values to skip over the guilty request.
+	 */
+	if (!request || request->fence.error != -EIO)
+		return;
 	/* We want a simple context + ring to execute the breadcrumb update.
 	 * We cannot rely on the context being intact across the GPU hang,
@@ -1361,6 +1374,7 @@ static void reset_common_ring(struct intel_engine_cs *engine,
 	 * future request will be after userspace has had the opportunity
 	 * to recreate its own state.
 	 */
+	ce = &request->ctx->engine[engine->id];
 	execlists_init_reg_state(ce->lrc_reg_state,
 				 request->ctx, engine, ce->ring);

--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -599,10 +599,62 @@ static int init_ring_common(struct intel_engine_cs *engine)
 static void reset_ring_common(struct intel_engine_cs *engine,
 			      struct drm_i915_gem_request *request)
 {
+	/* Try to restore the logical GPU state to match the continuation
+	 * of the request queue. If we skip the context/PD restore, then
+	 * the next request may try to execute assuming that its context
+	 * is valid and loaded on the GPU and so may try to access invalid
+	 * memory, prompting repeated GPU hangs.
+	 *
+	 * If the request was guilty, we still restore the logical state
+	 * in case the next request requires it (e.g. the aliasing ppgtt),
+	 * but skip over the hung batch.
+	 *
+	 * If the request was innocent, we try to replay the request with
+	 * the restored context.
+	 */
+	if (request) {
+		struct drm_i915_private *dev_priv = request->i915;
+		struct intel_context *ce = &request->ctx->engine[engine->id];
+		struct i915_hw_ppgtt *ppgtt;
+		/* FIXME consider gen8 reset */
+		if (ce->state) {
+			I915_WRITE(CCID,
+				   i915_ggtt_offset(ce->state) |
+				   BIT(8) /* must be set! */ |
+				   CCID_EXTENDED_STATE_SAVE |
+				   CCID_EXTENDED_STATE_RESTORE |
+				   CCID_EN);
+		}
+		ppgtt = request->ctx->ppgtt ?: engine->i915->mm.aliasing_ppgtt;
+		if (ppgtt) {
+			u32 pd_offset = ppgtt->pd.base.ggtt_offset << 10;
+			I915_WRITE(RING_PP_DIR_DCLV(engine), PP_DIR_DCLV_2G);
+			I915_WRITE(RING_PP_DIR_BASE(engine), pd_offset);
+			/* Wait for the PD reload to complete */
+			if (intel_wait_for_register(dev_priv,
+						    RING_PP_DIR_BASE(engine),
+						    BIT(0), 0,
+						    10))
+				DRM_ERROR("Wait for reload of ppgtt page-directory timed out\n");
+			ppgtt->pd_dirty_rings &= ~intel_engine_flag(engine);
+		}
+		/* If the rq hung, jump to its breadcrumb and skip the batch */
+		if (request->fence.error == -EIO) {
 			struct intel_ring *ring = request->ring;
 			ring->head = request->postfix;
 			ring->last_retired_head = -1;
+		}
+	} else {
+		engine->legacy_active_context = NULL;
+	}
 }
 static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)