diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 026cb52ece0b7e34db950c3d9a2128d46c290383..94b23fcbc989d35e3e24467c32f53ff3839cfc29 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4603,11 +4603,17 @@ static void __i915_gem_free_work(struct work_struct *work)
 	 * unbound now.
 	 */
 
+	spin_lock(&i915->mm.free_lock);
 	while ((freed = llist_del_all(&i915->mm.free_list))) {
+		spin_unlock(&i915->mm.free_lock);
+
 		__i915_gem_free_objects(i915, freed);
 		if (need_resched())
-			break;
+			return;
+
+		spin_lock(&i915->mm.free_lock);
 	}
+	spin_unlock(&i915->mm.free_lock);
 }
 
 static void __i915_gem_free_object_rcu(struct rcu_head *head)
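
An aside on the locking pattern in this hunk, since it is easy to get wrong: the worker detaches the whole list while holding free_lock, then drops the lock for the actual freeing (the slow part) and bails out entirely under need_resched(). Note that llist_del_all() is itself lockless; the lock is there so that other paths can serialise against the worker. A minimal userspace sketch of the same shape, with a pthread mutex standing in for the spinlock and a hand-rolled list for the llist -- all names illustrative:

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct node { struct node *next; };

static pthread_mutex_t free_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *free_list;

static bool need_resched(void) { return false; }	/* stub */

/* llist_del_all() analogue; caller holds free_lock */
static struct node *detach_all(void)
{
	struct node *head = free_list;
	free_list = NULL;
	return head;
}

static void free_worker(void)
{
	struct node *freed;

	pthread_mutex_lock(&free_lock);
	while ((freed = detach_all())) {
		pthread_mutex_unlock(&free_lock);

		/* The potentially slow freeing runs with the lock dropped. */
		while (freed) {
			struct node *next = freed->next;
			free(freed);
			freed = next;
		}
		if (need_resched())
			return;		/* lock already released */

		pthread_mutex_lock(&free_lock);
	}
	pthread_mutex_unlock(&free_lock);
}
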
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 5bf96a258509989e26b1ea4397ad8f8e7ffd9d81..e304dcbc6042d9e846efda1139f145b338056ac8 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -106,14 +106,9 @@ static void lut_close(struct i915_gem_context *ctx)
 
 	radix_tree_for_each_slot(slot, &ctx->handles_vma, &iter, 0) {
 		struct i915_vma *vma = rcu_dereference_raw(*slot);
-		struct drm_i915_gem_object *obj = vma->obj;
 
 		radix_tree_iter_delete(&ctx->handles_vma, &iter, slot);
-
-		if (!i915_vma_is_ggtt(vma))
-			i915_vma_close(vma);
-
-		__i915_gem_object_release_unless_active(obj);
+		__i915_gem_object_release_unless_active(vma->obj);
 	}
 }
 
@@ -198,6 +193,11 @@ static void context_close(struct i915_gem_context *ctx)
 {
 	i915_gem_context_set_closed(ctx);
 
+	/*
+	 * The LUT uses the VMA as a backpointer to unref the object,
+	 * so we need to clear the LUT before we close all the VMAs (inside
+	 * the ppgtt).
+	 */
 	lut_close(ctx);
 	if (ctx->ppgtt)
 		i915_ppgtt_close(&ctx->ppgtt->base);
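
The new comment captures a real ordering constraint, and it generalises: whenever a lookup table borrows a pointer into objects owned elsewhere, the table must be emptied before those objects are torn down. A toy sketch of the hazard, with invented types (not the driver's):

#include <stdlib.h>

struct object { int refcount; };
struct vma { struct object *obj; };

static void object_unref(struct object *obj)
{
	if (--obj->refcount == 0)
		free(obj);
}

static void lut_close(struct vma **lut, int nlut)
{
	for (int i = 0; i < nlut; i++)
		object_unref(lut[i]->obj);	/* dereferences the vma */
}

static void context_close(struct vma **lut, int nlut,
			  struct vma *vmas, int nvma)
{
	lut_close(lut, nlut);		/* must run before the vmas go away */
	for (int i = 0; i < nvma; i++)
		vmas[i].obj = NULL;	/* stand-in for closing the vma */
}
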
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 527a2d2d62810a6bb2f962a8190188947a6da85b..5eaa6893daaa1a193eda7235e5acf8e9adbda87c 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1341,7 +1341,7 @@ static int gen8_ppgtt_alloc_pd(struct i915_address_space *vm,
 			if (IS_ERR(pt))
 				goto unwind;
 
-			if (count < GEN8_PTES)
+			if (count < GEN8_PTES || intel_vgpu_active(vm->i915))
 				gen8_initialize_pt(vm, pt);
 
 			gen8_ppgtt_set_pde(vm, pd, pt, pde);
diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
index eb31f8aa5c216752f3c0e6c11e6ce56a3ee32a55..3770e3323fc8e932b63fc94e7a0353a0d1d6784c 100644
--- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
@@ -162,6 +162,18 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
 	if (!shrinker_lock(dev_priv, &unlock))
 		return 0;
 
+	/*
+	 * When shrinking the active list, also consider active contexts.
+	 * Active contexts are pinned until they are retired, and so cannot
+	 * simply be unbound to retire and unpin their pages. To shrink
+	 * the contexts, we must wait until the GPU is idle.
+	 *
+	 * We don't care about errors here; if we cannot wait upon the GPU,
+	 * we will free as much as we can and hope to get a second chance.
+	 */
+	if (flags & I915_SHRINK_ACTIVE)
+		i915_gem_wait_for_idle(dev_priv, I915_WAIT_LOCKED);
+
 	trace_i915_gem_shrink(dev_priv, target, flags);
 	i915_gem_retire_requests(dev_priv);
 
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index a2e8114b739d9182bf005a35514b83e397c29790..f84c267728fdaa2959c0a1512fc6978b15dcfa61 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -610,6 +610,7 @@ static void i915_guc_dequeue(struct intel_engine_cs *engine)
 	execlists->first = rb;
 	if (submit) {
 		port_assign(port, last);
+		execlists_set_active(execlists, EXECLISTS_ACTIVE_USER);
 		i915_guc_submit(engine);
 	}
 	spin_unlock_irq(&engine->timeline->lock);
@@ -633,6 +634,8 @@ static void i915_guc_irq_handler(unsigned long data)
 
 		rq = port_request(&port[0]);
 	}
+	if (!rq)
+		execlists_clear_active(execlists, EXECLISTS_ACTIVE_USER);
 
 	if (!port_isset(last_port))
 		i915_guc_dequeue(engine);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index b1296a55c1e4a8ee3caf7471e51baf19fa4a8f5e..f8205841868bcb5816273de628ef6bce53b0bfef 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1388,8 +1388,10 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
 	bool tasklet = false;
 
 	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift)) {
-		__set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-		tasklet = true;
+		if (READ_ONCE(engine->execlists.active)) {
+			__set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
+			tasklet = true;
+		}
 	}
 
 	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
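
The same idea in miniature: the hard-IRQ path now consults a software "expecting events" flag, so a context-switch interrupt that arrives after the final completion no longer latches irq_posted and re-arms the tasklet. A self-contained analogue, using C11 atomics purely as a stand-in for READ_ONCE(); every name here is invented:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint hw_active;	/* set on submit, cleared on last event */
static atomic_bool irq_posted;

static void tasklet_schedule(void) { }	/* stub */

static void cs_irq_handler(unsigned int iir, unsigned int ctx_switch_bit)
{
	if (!(iir & ctx_switch_bit))
		return;

	/* Only latch the event if we are still expecting one. */
	if (atomic_load_explicit(&hw_active, memory_order_relaxed)) {
		atomic_store(&irq_posted, true);
		tasklet_schedule();
	}
}
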
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index fc77e8191eb55fc54ab31a28d7017f7e4aaeff9d..fbfab2f3302326cafdfbd7443e76970ab68f5c7a 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -54,6 +54,13 @@ i915_vma_retire(struct i915_gem_active *active,
 	if (--obj->active_count)
 		return;
 
+	/* Prune the shared fence arrays iff completely idle (inc. external) */
+	if (reservation_object_trylock(obj->resv)) {
+		if (reservation_object_test_signaled_rcu(obj->resv, true))
+			reservation_object_add_excl_fence(obj->resv, NULL);
+		reservation_object_unlock(obj->resv);
+	}
+
 	/* Bump our place on the bound list to keep it roughly in LRU order
 	 * so that we don't steal from recently used but inactive objects
 	 * (unless we are forced to ofc!)
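
The trylock is the interesting part: pruning is purely an optimisation, so contention is a reason to skip it rather than wait. A userspace analogue of that shape (a pthread mutex and a flag instead of the reservation-object API; illustrative only):

#include <pthread.h>
#include <stdbool.h>

struct resv {
	pthread_mutex_t lock;
	int nfences;
	bool all_signaled;
};

static void prune_if_idle(struct resv *resv)
{
	if (pthread_mutex_trylock(&resv->lock))
		return;			/* contended: someone else's turn */

	if (resv->all_signaled)
		resv->nfences = 0;	/* drop the stale fence array */

	pthread_mutex_unlock(&resv->lock);
}
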
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index e2ac976844d807a968a7355d1a69bcd89529fd15..f4a9a182868f50dc0cd3f04c69de73909e94216a 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -3676,6 +3676,7 @@ void intel_finish_reset(struct drm_i915_private *dev_priv)
 
 		intel_pps_unlock_regs_wa(dev_priv);
 		intel_modeset_init_hw(dev);
+		intel_init_clock_gating(dev_priv);
 
 		spin_lock_irq(&dev_priv->irq_lock);
 		if (dev_priv->display.hpd_irq_setup)
@@ -14350,8 +14351,6 @@ void intel_modeset_init_hw(struct drm_device *dev)
 
 	intel_update_cdclk(dev_priv);
 	dev_priv->cdclk.logical = dev_priv->cdclk.actual = dev_priv->cdclk.hw;
-
-	intel_init_clock_gating(dev_priv);
 }
 
 /*
@@ -15063,6 +15062,15 @@ intel_modeset_setup_hw_state(struct drm_device *dev,
 	struct intel_encoder *encoder;
 	int i;
 
+	if (IS_HASWELL(dev_priv)) {
+		/*
+		 * WaRsPkgCStateDisplayPMReq:hsw
+		 * System hang if this isn't done before disabling all planes!
+		 */
+		I915_WRITE(CHICKEN_PAR1_1,
+			   I915_READ(CHICKEN_PAR1_1) | FORCE_ARB_IDLE_PLANES);
+	}
+
 	intel_modeset_readout_hw_state(dev);
 
 	/* HW state is read out, now we need to sanitize this mess. */
@@ -15160,6 +15168,8 @@ void intel_modeset_gem_init(struct drm_device *dev)
 
 	intel_init_gt_powersave(dev_priv);
 
+	intel_init_clock_gating(dev_priv);
+
 	intel_setup_overlay(dev_priv);
 }
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index a47a9c6bea5212309da71cd6f45dde23664e3b45..ab5bf4e2e28e21f3daa7072a37cb6895aba8233c 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1548,8 +1548,8 @@ bool intel_engine_is_idle(struct intel_engine_cs *engine)
 	if (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted))
 		return false;
 
-	/* Both ports drained, no more ELSP submission? */
-	if (port_request(&engine->execlists.port[0]))
+	/* Waiting to drain ELSP? */
+	if (READ_ONCE(engine->execlists.active))
 		return false;
 
 	/* ELSP is empty, but there are ready requests? */
@@ -1749,6 +1749,7 @@ void intel_engine_dump(struct intel_engine_cs *engine, struct drm_printer *m)
 					   idx);
 			}
 		}
+		drm_printf(m, "\t\tHW active? 0x%x\n", execlists->active);
 		rcu_read_unlock();
 	} else if (INTEL_GEN(dev_priv) > 6) {
 		drm_printf(m, "\tPP_DIR_BASE: 0x%08x\n",
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 7f45dd7dc3e50cb35f0c7de71231a1e0164c388d..d36e2560743545cef7b32b24adb794e719ca5da2 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -575,7 +575,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 			 * the state of the GPU is known (idle).
 			 */
 			inject_preempt_context(engine);
-			execlists->preempt = true;
+			execlists_set_active(execlists,
+					     EXECLISTS_ACTIVE_PREEMPT);
 			goto unlock;
 		} else {
 			/*
@@ -683,8 +684,10 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 unlock:
 	spin_unlock_irq(&engine->timeline->lock);
 
-	if (submit)
+	if (submit) {
+		execlists_set_active(execlists, EXECLISTS_ACTIVE_USER);
 		execlists_submit_ports(engine);
+	}
 }
 
 static void
@@ -696,6 +699,7 @@ execlist_cancel_port_requests(struct intel_engine_execlists *execlists)
 	while (num_ports-- && port_isset(port)) {
 		struct drm_i915_gem_request *rq = port_request(port);
 
+		GEM_BUG_ON(!execlists->active);
 		execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_PREEMPTED);
 		i915_gem_request_put(rq);
 
@@ -730,7 +734,6 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 
 		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
 			INIT_LIST_HEAD(&rq->priotree.link);
-			rq->priotree.priority = INT_MAX;
 
 			dma_fence_set_error(&rq->fence, -EIO);
 			__i915_gem_request_submit(rq);
@@ -861,15 +864,21 @@ static void intel_lrc_irq_handler(unsigned long data)
 				unwind_incomplete_requests(engine);
 				spin_unlock_irq(&engine->timeline->lock);
 
-				GEM_BUG_ON(!execlists->preempt);
-				execlists->preempt = false;
+				GEM_BUG_ON(!execlists_is_active(execlists,
+								EXECLISTS_ACTIVE_PREEMPT));
+				execlists_clear_active(execlists,
+						       EXECLISTS_ACTIVE_PREEMPT);
 				continue;
 			}
 
 			if (status & GEN8_CTX_STATUS_PREEMPTED &&
-			    execlists->preempt)
+			    execlists_is_active(execlists,
+						EXECLISTS_ACTIVE_PREEMPT))
 				continue;
 
+			GEM_BUG_ON(!execlists_is_active(execlists,
+							EXECLISTS_ACTIVE_USER));
+
 			/* Check the context/desc id for this event matches */
 			GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
 
@@ -881,7 +890,6 @@ static void intel_lrc_irq_handler(unsigned long data)
 				execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
 
 				trace_i915_gem_request_out(rq);
-				rq->priotree.priority = INT_MAX;
 				i915_gem_request_put(rq);
 
 				execlists_port_complete(execlists, port);
@@ -892,6 +900,9 @@ static void intel_lrc_irq_handler(unsigned long data)
 			/* After the final element, the hw should be idle */
 			GEM_BUG_ON(port_count(port) == 0 &&
 				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
+			if (port_count(port) == 0)
+				execlists_clear_active(execlists,
+						       EXECLISTS_ACTIVE_USER);
 		}
 
 		if (head != execlists->csb_head) {
@@ -901,7 +912,7 @@ static void intel_lrc_irq_handler(unsigned long data)
 		}
 	}
 
-	if (!execlists->preempt)
+	if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT))
 		execlists_dequeue(engine);
 
 	intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
@@ -1460,7 +1471,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 		   GT_CONTEXT_SWITCH_INTERRUPT << engine->irq_shift);
 	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
 	execlists->csb_head = -1;
-	execlists->preempt = false;
+	execlists->active = 0;
 
 	/* After a GPU reset, we may have requests to replay */
 	if (!i915_modparams.enable_guc_submission && execlists->first)
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index c42a65a93b3a901f686ca5ad15b4fe60d6751c93..aa12a44e9a76b1cf8419dccd3494d1bda76f6e5b 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -3133,7 +3133,11 @@ static int ilk_compute_intermediate_wm(struct drm_device *dev,
 				       struct intel_crtc_state *newstate)
 {
 	struct intel_pipe_wm *a = &newstate->wm.ilk.intermediate;
-	struct intel_pipe_wm *b = &intel_crtc->wm.active.ilk;
+	struct intel_atomic_state *intel_state =
+		to_intel_atomic_state(newstate->base.state);
+	const struct intel_crtc_state *oldstate =
+		intel_atomic_get_old_crtc_state(intel_state, intel_crtc);
+	const struct intel_pipe_wm *b = &oldstate->wm.ilk.optimal;
 	int level, max_level = ilk_wm_max_level(to_i915(dev));
 
 	/*
@@ -3142,6 +3146,9 @@ static int ilk_compute_intermediate_wm(struct drm_device *dev,
 	 * and after the vblank.
 	 */
 	*a = newstate->wm.ilk.optimal;
+	if (!newstate->base.active || drm_atomic_crtc_needs_modeset(&newstate->base))
+		return 0;
+
 	a->pipe_enabled |= b->pipe_enabled;
 	a->sprites_enabled |= b->sprites_enabled;
 	a->sprites_scaled |= b->sprites_scaled;
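
The per-level merge further down this function (unchanged by the hunk) takes the worst case of the two configurations, since the intermediate watermarks must be safe both before and after the vblank. A simplified sketch with invented single-value levels:

#include <stdbool.h>

struct wm_level { bool enabled; unsigned int val; };

/* a = new optimal (in/out), b = old optimal; afterwards a[] satisfies
 * both the old and the new configuration.
 */
static void merge_intermediate_wm(struct wm_level *a,
				  const struct wm_level *b, int max_level)
{
	for (int level = 0; level <= max_level; level++) {
		a[level].enabled &= b[level].enabled;
		if (b[level].val > a[level].val)
			a[level].val = b[level].val;
	}
}
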
@@ -5755,12 +5762,30 @@ void vlv_wm_sanitize(struct drm_i915_private *dev_priv)
 	mutex_unlock(&dev_priv->wm.wm_mutex);
 }
 
+/*
+ * FIXME should probably kill this and improve
+ * the real watermark readout/sanitization instead
+ */
+static void ilk_init_lp_watermarks(struct drm_i915_private *dev_priv)
+{
+	I915_WRITE(WM3_LP_ILK, I915_READ(WM3_LP_ILK) & ~WM1_LP_SR_EN);
+	I915_WRITE(WM2_LP_ILK, I915_READ(WM2_LP_ILK) & ~WM1_LP_SR_EN);
+	I915_WRITE(WM1_LP_ILK, I915_READ(WM1_LP_ILK) & ~WM1_LP_SR_EN);
+
+	/*
+	 * Don't touch WM1S_LP_EN here.
+	 * Doing so could cause underruns.
+	 */
+}
+
 void ilk_wm_get_hw_state(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct ilk_wm_values *hw = &dev_priv->wm.hw;
 	struct drm_crtc *crtc;
 
+	ilk_init_lp_watermarks(dev_priv);
+
 	for_each_crtc(dev, crtc)
 		ilk_pipe_wm_get_hw_state(crtc);
 
@@ -8207,18 +8232,6 @@ static void g4x_disable_trickle_feed(struct drm_i915_private *dev_priv)
 	}
 }
 
-static void ilk_init_lp_watermarks(struct drm_i915_private *dev_priv)
-{
-	I915_WRITE(WM3_LP_ILK, I915_READ(WM3_LP_ILK) & ~WM1_LP_SR_EN);
-	I915_WRITE(WM2_LP_ILK, I915_READ(WM2_LP_ILK) & ~WM1_LP_SR_EN);
-	I915_WRITE(WM1_LP_ILK, I915_READ(WM1_LP_ILK) & ~WM1_LP_SR_EN);
-
-	/*
-	 * Don't touch WM1S_LP_EN here.
-	 * Doing so could cause underruns.
-	 */
-}
-
 static void ilk_init_clock_gating(struct drm_i915_private *dev_priv)
 {
 	uint32_t dspclk_gate = ILK_VRHUNIT_CLOCK_GATE_DISABLE;
@@ -8252,8 +8265,6 @@ static void ilk_init_clock_gating(struct drm_i915_private *dev_priv)
 		   (I915_READ(DISP_ARB_CTL) |
 		    DISP_FBC_WM_DIS));
 
-	ilk_init_lp_watermarks(dev_priv);
-
 	/*
 	 * Based on the document from hardware guys the following bits
 	 * should be set unconditionally in order to enable FBC.
@@ -8366,8 +8377,6 @@ static void gen6_init_clock_gating(struct drm_i915_private *dev_priv)
 	I915_WRITE(GEN6_GT_MODE,
 		   _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4));
 
-	ilk_init_lp_watermarks(dev_priv);
-
 	I915_WRITE(CACHE_MODE_0,
 		   _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB));
 
@@ -8594,8 +8603,6 @@ static void bdw_init_clock_gating(struct drm_i915_private *dev_priv)
 						 I915_GTT_PAGE_SIZE_2M);
 	enum pipe pipe;
 
-	ilk_init_lp_watermarks(dev_priv);
-
 	/* WaSwitchSolVfFArbitrationPriority:bdw */
 	I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL);
 
@@ -8646,8 +8653,6 @@ static void bdw_init_clock_gating(struct drm_i915_private *dev_priv)
 
 static void hsw_init_clock_gating(struct drm_i915_private *dev_priv)
 {
-	ilk_init_lp_watermarks(dev_priv);
-
 	/* L3 caching of data atomics doesn't work -- disable it. */
 	I915_WRITE(HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
 	I915_WRITE(HSW_ROW_CHICKEN3,
@@ -8691,10 +8696,6 @@ static void hsw_init_clock_gating(struct drm_i915_private *dev_priv)
 	/* WaSwitchSolVfFArbitrationPriority:hsw */
 	I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL);
 
-	/* WaRsPkgCStateDisplayPMReq:hsw */
-	I915_WRITE(CHICKEN_PAR1_1,
-		   I915_READ(CHICKEN_PAR1_1) | FORCE_ARB_IDLE_PLANES);
-
 	lpt_init_clock_gating(dev_priv);
 }
 
@@ -8702,8 +8703,6 @@ static void ivb_init_clock_gating(struct drm_i915_private *dev_priv)
 {
 	uint32_t snpcr;
 
-	ilk_init_lp_watermarks(dev_priv);
-
 	I915_WRITE(ILK_DSPCLK_GATE_D, ILK_VRHUNIT_CLOCK_GATE_DISABLE);
 
 	/* WaDisableEarlyCull:ivb */
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 17186f067408587543e8f7955e00f4d86c51cb72..6a42ed618a28ce1f47b2aeca7263cb9f561a1a86 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -241,9 +241,17 @@ struct intel_engine_execlists {
 	} port[EXECLIST_MAX_PORTS];
 
 	/**
-	 * @preempt: are we currently handling a preempting context switch?
+	 * @active: is the HW active? We consider the HW as active after
+	 * submitting any context for execution and until we have seen the
+	 * last context completion event. After that, we do not expect any
+	 * more events until we submit, and so can park the HW.
+	 *
+	 * As we have a small number of different sources from which we feed
+	 * the HW, we track the state of each inside a single bitfield.
 	 */
-	bool preempt;
+	unsigned int active;
+#define EXECLISTS_ACTIVE_USER 0
+#define EXECLISTS_ACTIVE_PREEMPT 1
 
 	/**
 	 * @port_mask: number of execlist ports - 1
@@ -525,6 +533,27 @@ struct intel_engine_cs {
 	u32 (*get_cmd_length_mask)(u32 cmd_header);
 };
 
+static inline void
+execlists_set_active(struct intel_engine_execlists *execlists,
+		     unsigned int bit)
+{
+	__set_bit(bit, (unsigned long *)&execlists->active);
+}
+
+static inline void
+execlists_clear_active(struct intel_engine_execlists *execlists,
+		       unsigned int bit)
+{
+	__clear_bit(bit, (unsigned long *)&execlists->active);
+}
+
+static inline bool
+execlists_is_active(const struct intel_engine_execlists *execlists,
+		    unsigned int bit)
+{
+	return test_bit(bit, (unsigned long *)&execlists->active);
+}
+
 static inline unsigned int
 execlists_num_ports(const struct intel_engine_execlists * const execlists)
 {
@@ -538,6 +567,7 @@ execlists_port_complete(struct intel_engine_execlists * const execlists,
 	const unsigned int m = execlists->port_mask;
 
 	GEM_BUG_ON(port_index(port, execlists) != 0);
+	GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_USER));
 
 	memmove(port, port + 1, m * sizeof(struct execlist_port));
 	memset(port + m, 0, sizeof(struct execlist_port));
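
A standalone analogue of the new bookkeeping, for flavour: several submission sources share one word, and "busy" is simply "any bit set", which is what the READ_ONCE(execlists->active) checks above rely on. The kernel helpers can use the non-atomic __set_bit() variants since updates come from the engine's tasklet; this sketch uses C11 atomics only so that it is self-contained:

#include <stdatomic.h>
#include <stdbool.h>

enum { ACTIVE_USER, ACTIVE_PREEMPT };	/* one bit per submission source */

static atomic_uint active;

static void set_active(unsigned int bit)
{
	atomic_fetch_or(&active, 1u << bit);
}

static void clear_active(unsigned int bit)
{
	atomic_fetch_and(&active, ~(1u << bit));
}

static bool engine_busy(void)
{
	return atomic_load(&active) != 0;	/* idle only when all clear */
}
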