提交 9107e9d2 编写于 作者: C Chris Wilson 提交者: Daniel Vetter

drm/i915: Only slightly increment hangcheck score if we successfully kick a ring

After kicking a ring, it should be free to make progress again and so
should not be accused of being stuck until hangcheck fires once more. In
order to catch a denial-of-service within a batch or across multiple
batches, we still do increment the hangcheck score - just not as
severely so that it takes multiple kicks to fail.

This should address part of Ben's justified criticism of

commit 05407ff8
Author: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Date:   Thu May 30 09:04:29 2013 +0300

    drm/i915: detect hang using per ring hangcheck_score

"There's also another corner case on the kick. If the seqno = 2
(though not stuck), and on the 3rd hangcheck, the ring is stuck, and
we try to kick it... we don't actually try to find out if the kick
helped."

v2: Make sure we catch DoS attempts with batches full of invalid WAITs.
v3: Preserve the ability to detect loops by always charging the ring
    if it is busy on the same request.
v4: Make sure we queue another check if on a new batch

References: https://bugs.freedesktop.org/show_bug.cgi?id=65394
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
上级 50f018df
...@@ -2314,21 +2314,11 @@ ring_last_seqno(struct intel_ring_buffer *ring) ...@@ -2314,21 +2314,11 @@ ring_last_seqno(struct intel_ring_buffer *ring)
struct drm_i915_gem_request, list)->seqno; struct drm_i915_gem_request, list)->seqno;
} }
static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring, static bool
u32 ring_seqno, bool *err) ring_idle(struct intel_ring_buffer *ring, u32 seqno)
{ {
if (list_empty(&ring->request_list) || return (list_empty(&ring->request_list) ||
i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) { i915_seqno_passed(seqno, ring_last_seqno(ring)));
/* Issue a wake-up to catch stuck h/w. */
if (waitqueue_active(&ring->irq_queue)) {
DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
ring->name);
wake_up_all(&ring->irq_queue);
*err = true;
}
return true;
}
return false;
} }
static bool semaphore_passed(struct intel_ring_buffer *ring) static bool semaphore_passed(struct intel_ring_buffer *ring)
...@@ -2362,16 +2352,26 @@ static bool semaphore_passed(struct intel_ring_buffer *ring) ...@@ -2362,16 +2352,26 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)
ioread32(ring->virtual_start+acthd+4)+1); ioread32(ring->virtual_start+acthd+4)+1);
} }
static bool kick_ring(struct intel_ring_buffer *ring) static bool ring_hung(struct intel_ring_buffer *ring)
{ {
struct drm_device *dev = ring->dev; struct drm_device *dev = ring->dev;
struct drm_i915_private *dev_priv = dev->dev_private; struct drm_i915_private *dev_priv = dev->dev_private;
u32 tmp = I915_READ_CTL(ring); u32 tmp;
if (IS_GEN2(dev))
return true;
/* Is the chip hanging on a WAIT_FOR_EVENT?
* If so we can simply poke the RB_WAIT bit
* and break the hang. This should work on
* all but the second generation chipsets.
*/
tmp = I915_READ_CTL(ring);
if (tmp & RING_WAIT) { if (tmp & RING_WAIT) {
DRM_ERROR("Kicking stuck wait on %s\n", DRM_ERROR("Kicking stuck wait on %s\n",
ring->name); ring->name);
I915_WRITE_CTL(ring, tmp); I915_WRITE_CTL(ring, tmp);
return true; return false;
} }
if (INTEL_INFO(dev)->gen >= 6 && if (INTEL_INFO(dev)->gen >= 6 &&
...@@ -2380,22 +2380,10 @@ static bool kick_ring(struct intel_ring_buffer *ring) ...@@ -2380,22 +2380,10 @@ static bool kick_ring(struct intel_ring_buffer *ring)
DRM_ERROR("Kicking stuck semaphore on %s\n", DRM_ERROR("Kicking stuck semaphore on %s\n",
ring->name); ring->name);
I915_WRITE_CTL(ring, tmp); I915_WRITE_CTL(ring, tmp);
return true;
}
return false;
}
static bool i915_hangcheck_ring_hung(struct intel_ring_buffer *ring)
{
if (IS_GEN2(ring->dev))
return false; return false;
}
/* Is the chip hanging on a WAIT_FOR_EVENT? return true;
* If so we can simply poke the RB_WAIT bit
* and break the hang. This should work on
* all but the second generation chipsets.
*/
return !kick_ring(ring);
} }
/** /**
...@@ -2413,45 +2401,63 @@ void i915_hangcheck_elapsed(unsigned long data) ...@@ -2413,45 +2401,63 @@ void i915_hangcheck_elapsed(unsigned long data)
struct intel_ring_buffer *ring; struct intel_ring_buffer *ring;
int i; int i;
int busy_count = 0, rings_hung = 0; int busy_count = 0, rings_hung = 0;
bool stuck[I915_NUM_RINGS]; bool stuck[I915_NUM_RINGS] = { 0 };
#define BUSY 1
#define KICK 5
#define HUNG 20
#define FIRE 30
if (!i915_enable_hangcheck) if (!i915_enable_hangcheck)
return; return;
for_each_ring(ring, dev_priv, i) { for_each_ring(ring, dev_priv, i) {
u32 seqno, acthd; u32 seqno, acthd;
bool idle, err = false; bool busy = true;
seqno = ring->get_seqno(ring, false); seqno = ring->get_seqno(ring, false);
acthd = intel_ring_get_active_head(ring); acthd = intel_ring_get_active_head(ring);
idle = i915_hangcheck_ring_idle(ring, seqno, &err);
stuck[i] = ring->hangcheck.acthd == acthd;
if (idle) {
if (err)
ring->hangcheck.score += 2;
else
ring->hangcheck.score = 0;
} else {
busy_count++;
if (ring->hangcheck.seqno == seqno) { if (ring->hangcheck.seqno == seqno) {
ring->hangcheck.score++; if (ring_idle(ring, seqno)) {
if (waitqueue_active(&ring->irq_queue)) {
/* Kick ring if stuck*/ /* Issue a wake-up to catch stuck h/w. */
if (stuck[i]) DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
i915_hangcheck_ring_hung(ring); ring->name);
wake_up_all(&ring->irq_queue);
ring->hangcheck.score += HUNG;
} else
busy = false;
} else { } else {
ring->hangcheck.score = 0; int score;
stuck[i] = ring->hangcheck.acthd == acthd;
if (stuck[i]) {
/* Every time we kick the ring, add a
* small increment to the hangcheck
* score so that we can catch a
* batch that is repeatedly kicked.
*/
score = ring_hung(ring) ? HUNG : KICK;
} else
score = BUSY;
ring->hangcheck.score += score;
} }
} else {
/* Gradually reduce the count so that we catch DoS
* attempts across multiple batches.
*/
if (ring->hangcheck.score > 0)
ring->hangcheck.score--;
} }
ring->hangcheck.seqno = seqno; ring->hangcheck.seqno = seqno;
ring->hangcheck.acthd = acthd; ring->hangcheck.acthd = acthd;
busy_count += busy;
} }
for_each_ring(ring, dev_priv, i) { for_each_ring(ring, dev_priv, i) {
if (ring->hangcheck.score > 2) { if (ring->hangcheck.score > FIRE) {
rings_hung++; rings_hung++;
DRM_ERROR("%s: %s on %s 0x%x\n", ring->name, DRM_ERROR("%s: %s on %s 0x%x\n", ring->name,
stuck[i] ? "stuck" : "no progress", stuck[i] ? "stuck" : "no progress",
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册