diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 306d9e4e5cf376bc57901d665622a8b16fdce4d9..430571b977db9d771a2ff15ae543526e3a321418 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -131,7 +131,7 @@ static const struct drm_i915_cmd_descriptor common_cmds[] = {
 			.mask = MI_GLOBAL_GTT,
 			.expected = 0,
 	      }},						       ),
-	CMD(  MI_LOAD_REGISTER_MEM,             SMI,   !F,  0xFF,   W | B,
+	CMD(  MI_LOAD_REGISTER_MEM(1),             SMI,   !F,  0xFF,   W | B,
 	      .reg = { .offset = 1, .mask = 0x007FFFFC },
 	      .bits = {{
 			.offset = 0,
@@ -1021,7 +1021,7 @@ static bool check_cmd(const struct intel_engine_cs *ring,
 			 * only MI_LOAD_REGISTER_IMM commands.
 			 */
 			if (reg_addr == OACONTROL) {
-				if (desc->cmd.value == MI_LOAD_REGISTER_MEM) {
+				if (desc->cmd.value == MI_LOAD_REGISTER_MEM(1)) {
 					DRM_DEBUG_DRIVER("CMD: Rejected LRM to OACONTROL\n");
 					return false;
 				}
@@ -1035,7 +1035,7 @@ static bool check_cmd(const struct intel_engine_cs *ring,
 			 * allowed mask/value pair given in the whitelist entry.
 			 */
 			if (reg->mask) {
-				if (desc->cmd.value == MI_LOAD_REGISTER_MEM) {
+				if (desc->cmd.value == MI_LOAD_REGISTER_MEM(1)) {
 					DRM_DEBUG_DRIVER("CMD: Rejected LRM to masked register 0x%08X\n",
 							 reg_addr);
 					return false;
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 45ff3d3e79c8111a61629409732c667434b8627e..1c4d7894b429a880bffa5c9cea38c9600730ebbd 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -354,6 +354,8 @@
 #define   MI_INVALIDATE_BSD		(1<<7)
 #define   MI_FLUSH_DW_USE_GTT		(1<<2)
 #define   MI_FLUSH_DW_USE_PPGTT		(0<<2)
+#define MI_LOAD_REGISTER_MEM(x) MI_INSTR(0x29, 2*(x)-1) /* x = register count; presumably 2 dwords each */
+#define MI_LOAD_REGISTER_MEM_GEN8(x) MI_INSTR(0x29, 3*(x)-1) /* gen8 users emit 3 dwords per register */
 #define MI_BATCH_BUFFER		MI_INSTR(0x30, 1)
 #define   MI_BATCH_NON_SECURE		(1)
 /* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. */
@@ -459,7 +461,6 @@
 #define MI_CLFLUSH              MI_INSTR(0x27, 0)
 #define MI_REPORT_PERF_COUNT    MI_INSTR(0x28, 0)
 #define   MI_REPORT_PERF_COUNT_GGTT (1<<0)
-#define MI_LOAD_REGISTER_MEM    MI_INSTR(0x29, 0)
 #define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 0)
 #define MI_RS_STORE_DATA_IMM    MI_INSTR(0x2B, 0)
 #define MI_LOAD_URB_MEM         MI_INSTR(0x2C, 0)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 0160bec1e7ba798a9104c697db3f2cdabfbbf938..a499f16db194b4471520ff0d9c02978ade1fa110 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1092,6 +1092,56 @@ static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
 		batch[index++] = (cmd);					\
 	} while (0)
 
+
+/*
+ * Emit the coherent-L3 flush WA into @batch starting at dword @index and
+ * return the index following the last dword written.
+ *
+ * The WA sets GEN8_L3SQCREG4[21:21] around a PIPE_CONTROL and clears it again
+ * afterwards. The WA batch is built only once, so a register value sampled
+ * at build time cannot be reused later; instead the batch saves the register
+ * to scratch memory, loads a constant with bit 21 set, issues the flush and
+ * then restores the saved value. The constant is based on the register's
+ * default value, which is acceptable because it is live only briefly and this
+ * batch is non-preemptible; reading the live value would complicate the WA.
+ * Also required for Gen9, hence a shared helper.
+ *
+ * NOTE(review): declared int although @index is uint32_t; callers testing the
+ * result for < 0 need a signed variable - confirm wa_ctx_emit's error path.
+ */
+static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *ring,
+						uint32_t *const batch,
+						uint32_t index)
+{
+	uint32_t l3sqc4_flush = (0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES);
+
+	wa_ctx_emit(batch, (MI_STORE_REGISTER_MEM_GEN8(1) |
+			    MI_SRM_LRM_GLOBAL_GTT));
+	wa_ctx_emit(batch, GEN8_L3SQCREG4);	/* save the current value ... */
+	wa_ctx_emit(batch, ring->scratch.gtt_offset + 256);	/* ... to scratch */
+	wa_ctx_emit(batch, 0);
+
+	wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1));
+	wa_ctx_emit(batch, GEN8_L3SQCREG4);	/* load constant with flush bit */
+	wa_ctx_emit(batch, l3sqc4_flush);
+
+	wa_ctx_emit(batch, GFX_OP_PIPE_CONTROL(6));	/* 6-dword flush */
+	wa_ctx_emit(batch, (PIPE_CONTROL_CS_STALL |
+			    PIPE_CONTROL_DC_FLUSH_ENABLE));
+	wa_ctx_emit(batch, 0);
+	wa_ctx_emit(batch, 0);
+	wa_ctx_emit(batch, 0);
+	wa_ctx_emit(batch, 0);
+
+	wa_ctx_emit(batch, (MI_LOAD_REGISTER_MEM_GEN8(1) |
+			    MI_SRM_LRM_GLOBAL_GTT));
+	wa_ctx_emit(batch, GEN8_L3SQCREG4);	/* restore the saved value ... */
+	wa_ctx_emit(batch, ring->scratch.gtt_offset + 256);	/* ... from scratch */
+	wa_ctx_emit(batch, 0);
+
+	return index;
+}
+
 static inline uint32_t wa_ctx_start(struct i915_wa_ctx_bb *wa_ctx,
 				    uint32_t offset,
 				    uint32_t start_alignment)
@@ -1152,25 +1202,9 @@ static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
 
 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
 	if (IS_BROADWELL(ring->dev)) {
-		struct drm_i915_private *dev_priv = to_i915(ring->dev);
-		uint32_t l3sqc4_flush = (I915_READ(GEN8_L3SQCREG4) |
-					 GEN8_LQSC_FLUSH_COHERENT_LINES);
-
-		wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1));
-		wa_ctx_emit(batch, GEN8_L3SQCREG4);
-		wa_ctx_emit(batch, l3sqc4_flush);
-
-		wa_ctx_emit(batch, GFX_OP_PIPE_CONTROL(6));
-		wa_ctx_emit(batch, (PIPE_CONTROL_CS_STALL |
-				    PIPE_CONTROL_DC_FLUSH_ENABLE));
-		wa_ctx_emit(batch, 0);
-		wa_ctx_emit(batch, 0);
-		wa_ctx_emit(batch, 0);
-		wa_ctx_emit(batch, 0);
-
-		wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1));
-		wa_ctx_emit(batch, GEN8_L3SQCREG4);
-		wa_ctx_emit(batch, l3sqc4_flush & ~GEN8_LQSC_FLUSH_COHERENT_LINES);
+		index = gen8_emit_flush_coherentl3_wa(ring, batch, index);
+		if (index < 0)
+			return index;
 	}
 
 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */