提交 86d7f238 编写于 作者: A Arun Siluvery 提交者: Daniel Vetter

drm/i915/bdw: Apply workarounds in render ring init function

For BDW, workarounds are currently initialized in init_clock_gating(), but
they are lost during reset, suspend/resume, etc. This patch moves the WAs
that are part of the register state context to the render ring init fn;
otherwise the default context ends up with incorrect values, as they don't
get initialized until the init_clock_gating fn.

v2: Add workarounds to golden render state
This method has its own issues: first of all, it is different for
each gen and it is generated using a tool, so adding new workarounds
and maintaining them across gens is not a straightforward process.

v3: Use LRIs to emit these workarounds (Ville)
Instead of modifying the golden render state the same LRIs are
emitted from within the driver.

v4: Use abstract name when exporting gen specific routines (Chris)

For: VIZ-4092
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
上级 c5ad011d
......@@ -628,6 +628,12 @@ static int do_switch(struct intel_engine_cs *ring,
ring->last_context = to;
if (uninitialized) {
if (ring->init_context) {
ret = ring->init_context(ring);
if (ret)
DRM_ERROR("ring init context: %d\n", ret);
}
ret = i915_gem_render_state_init(ring);
if (ret)
DRM_ERROR("init render state: %d\n", ret);
......
......@@ -5536,37 +5536,12 @@ static void broadwell_init_clock_gating(struct drm_device *dev)
/* FIXME(BDW): Check all the w/a, some might only apply to
* pre-production hw. */
/* WaDisablePartialInstShootdown:bdw */
I915_WRITE(GEN8_ROW_CHICKEN,
_MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE));
/* WaDisableThreadStallDopClockGating:bdw */
/* FIXME: Unclear whether we really need this on production bdw. */
I915_WRITE(GEN8_ROW_CHICKEN,
_MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
/*
* This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
* pre-production hardware
*/
I915_WRITE(HALF_SLICE_CHICKEN3,
_MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS));
I915_WRITE(HALF_SLICE_CHICKEN3,
_MASKED_BIT_ENABLE(GEN8_SAMPLER_POWER_BYPASS_DIS));
I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_BWGTLB_DISABLE));
I915_WRITE(_3D_CHICKEN3,
_MASKED_BIT_ENABLE(_3D_CHICKEN_SDE_LIMIT_FIFO_POLY_DEPTH(2)));
I915_WRITE(COMMON_SLICE_CHICKEN2,
_MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
I915_WRITE(GEN7_HALF_SLICE_CHICKEN1,
_MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
/* WaDisableDopClockGating:bdw May not be needed for production */
I915_WRITE(GEN7_ROW_CHICKEN2,
_MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
/* WaSwitchSolVfFArbitrationPriority:bdw */
I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL);
......@@ -5582,31 +5557,12 @@ static void broadwell_init_clock_gating(struct drm_device *dev)
BDW_DPRS_MASK_VBLANK_SRD);
}
/* Use Force Non-Coherent whenever executing a 3D context. This is a
* workaround for for a possible hang in the unlikely event a TLB
* invalidation occurs during a PSD flush.
*/
I915_WRITE(HDC_CHICKEN0,
I915_READ(HDC_CHICKEN0) |
_MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
/* WaVSRefCountFullforceMissDisable:bdw */
/* WaDSRefCountFullforceMissDisable:bdw */
I915_WRITE(GEN7_FF_THREAD_MODE,
I915_READ(GEN7_FF_THREAD_MODE) &
~(GEN8_FF_DS_REF_CNT_FFME | GEN7_FF_VS_REF_CNT_FFME));
/*
* BSpec recommends 8x4 when MSAA is used,
* however in practice 16x4 seems fastest.
*
* Note that PS/WM thread counts depend on the WIZ hashing
* disable bit, which we don't touch here, but it's good
* to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
*/
I915_WRITE(GEN7_GT_MODE,
GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
_MASKED_BIT_ENABLE(GEN8_RC_SEMA_IDLE_MSG_DISABLE));
......@@ -5614,10 +5570,6 @@ static void broadwell_init_clock_gating(struct drm_device *dev)
I915_WRITE(GEN8_UCGCTL6, I915_READ(GEN8_UCGCTL6) |
GEN8_SDEUNIT_CLOCK_GATE_DISABLE);
/* Wa4x4STCOptimizationDisable:bdw */
I915_WRITE(CACHE_MODE_1,
_MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
lpt_init_clock_gating(dev);
}
......
......@@ -657,6 +657,84 @@ intel_init_pipe_control(struct intel_engine_cs *ring)
return ret;
}
/*
 * Emit one workaround register write into the ring as a load-register-
 * immediate (LRI) packet: the MI_LOAD_REGISTER_IMM(1) header dword, then
 * the register address, then the value to load.
 *
 * @ring:  engine whose ring receives the three dwords
 * @addr:  register offset written as the second dword
 * @value: data written as the third dword
 *
 * This helper does not reserve ring space itself; the caller in this file
 * calls intel_ring_begin() first and intel_ring_advance() afterwards.
 */
static inline void intel_ring_emit_wa(struct intel_engine_cs *ring,
				      u32 addr, u32 value)
{
	/* NOTE(review): the argument 1 presumably means "one register" - confirm. */
	const u32 lri_header = MI_LOAD_REGISTER_IMM(1);

	intel_ring_emit(ring, lri_header);
	intel_ring_emit(ring, addr);
	intel_ring_emit(ring, value);
}
/*
 * Emit the gen8 workarounds that live in the register state context.
 *
 * These values are part of the context image, so they have to be written
 * again after a GPU reset, suspend/resume or module reload. They are emitted
 * into the ring as register loads via intel_ring_emit_wa().
 *
 * Returns 0 on success, or the error reported by intel_ring_begin() when
 * ring space could not be reserved (in which case nothing is emitted).
 */
static int gen8_init_workarounds(struct intel_engine_cs *ring)
{
	/*
	 * Each intel_ring_emit_wa() call below emits three dwords. Keep
	 * wa_count in sync with the number of calls whenever a workaround
	 * is added or removed, so the reservation matches what is emitted.
	 */
	const int dwords_per_wa = 3;
	const int wa_count = 8;
	int err;

	err = intel_ring_begin(ring, wa_count * dwords_per_wa);
	if (err)
		return err;

	/* WaDisablePartialInstShootdown:bdw */
	/* WaDisableThreadStallDopClockGating:bdw */
	/* FIXME: Unclear whether we really need this on production bdw. */
	intel_ring_emit_wa(ring, GEN8_ROW_CHICKEN,
			   _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE |
					      STALL_DOP_GATING_DISABLE));

	/* WaDisableDopClockGating:bdw May not be needed for production */
	intel_ring_emit_wa(ring, GEN7_ROW_CHICKEN2,
			   _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));

	/*
	 * The GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
	 * pre-production hardware.
	 */
	intel_ring_emit_wa(ring, HALF_SLICE_CHICKEN3,
			   _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS |
					      GEN8_SAMPLER_POWER_BYPASS_DIS));

	intel_ring_emit_wa(ring, GEN7_HALF_SLICE_CHICKEN1,
			   _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));

	intel_ring_emit_wa(ring, COMMON_SLICE_CHICKEN2,
			   _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));

	/*
	 * Use Force Non-Coherent whenever executing a 3D context. This is
	 * a workaround for a possible hang in the unlikely event a TLB
	 * invalidation occurs during a PSD flush.
	 */
	intel_ring_emit_wa(ring, HDC_CHICKEN0,
			   _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));

	/* Wa4x4STCOptimizationDisable:bdw */
	intel_ring_emit_wa(ring, CACHE_MODE_1,
			   _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));

	/*
	 * BSpec recommends 8x4 when MSAA is used, however in practice
	 * 16x4 seems fastest.
	 *
	 * Note that PS/WM thread counts depend on the WIZ hashing disable
	 * bit, which we don't touch here, but it's good to keep in mind
	 * (see 3DSTATE_PS and 3DSTATE_WM).
	 */
	intel_ring_emit_wa(ring, GEN7_GT_MODE,
			   GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);

	intel_ring_advance(ring);

	return 0;
}
static int init_render_ring(struct intel_engine_cs *ring)
{
struct drm_device *dev = ring->dev;
......@@ -2143,6 +2221,7 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
dev_priv->semaphore_obj = obj;
}
}
ring->init_context = gen8_init_workarounds;
ring->add_request = gen6_add_request;
ring->flush = gen8_render_ring_flush;
ring->irq_get = gen8_ring_get_irq;
......
......@@ -148,6 +148,8 @@ struct intel_engine_cs {
int (*init)(struct intel_engine_cs *ring);
int (*init_context)(struct intel_engine_cs *ring);
void (*write_tail)(struct intel_engine_cs *ring,
u32 value);
int __must_check (*flush)(struct intel_engine_cs *ring,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册