提交 3fb04cb0 编写于 作者: C Chris Wilson

drm/i915/selftests: Fix up igt_reset_engine

Now that we skip a per-engine reset on an idle engine, we need to update
the selftest to take that into account. In the process, we find that we
were not stressing the per-engine reset very hard, so add those missing
active resets.

v2: Actually test i915_reset_engine() by loading it with requests.

Fixes: f6ba181a ("drm/i915: Skip an engine reset if it recovered before our preparations")
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104313
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171217132852.30642-3-chris@chris-wilson.co.uk
Reviewed-by: Michel Thierry <michel.thierry@intel.com>
上级 151a99ec
...@@ -132,6 +132,12 @@ static int emit_recurse_batch(struct hang *h, ...@@ -132,6 +132,12 @@ static int emit_recurse_batch(struct hang *h,
*batch++ = lower_32_bits(hws_address(hws, rq)); *batch++ = lower_32_bits(hws_address(hws, rq));
*batch++ = upper_32_bits(hws_address(hws, rq)); *batch++ = upper_32_bits(hws_address(hws, rq));
*batch++ = rq->fence.seqno; *batch++ = rq->fence.seqno;
*batch++ = MI_ARB_CHECK;
memset(batch, 0, 1024);
batch += 1024 / sizeof(*batch);
*batch++ = MI_ARB_CHECK;
*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
*batch++ = lower_32_bits(vma->node.start); *batch++ = lower_32_bits(vma->node.start);
*batch++ = upper_32_bits(vma->node.start); *batch++ = upper_32_bits(vma->node.start);
...@@ -140,6 +146,12 @@ static int emit_recurse_batch(struct hang *h, ...@@ -140,6 +146,12 @@ static int emit_recurse_batch(struct hang *h,
*batch++ = 0; *batch++ = 0;
*batch++ = lower_32_bits(hws_address(hws, rq)); *batch++ = lower_32_bits(hws_address(hws, rq));
*batch++ = rq->fence.seqno; *batch++ = rq->fence.seqno;
*batch++ = MI_ARB_CHECK;
memset(batch, 0, 1024);
batch += 1024 / sizeof(*batch);
*batch++ = MI_ARB_CHECK;
*batch++ = MI_BATCH_BUFFER_START | 1 << 8; *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
*batch++ = lower_32_bits(vma->node.start); *batch++ = lower_32_bits(vma->node.start);
} else if (INTEL_GEN(i915) >= 4) { } else if (INTEL_GEN(i915) >= 4) {
...@@ -147,12 +159,24 @@ static int emit_recurse_batch(struct hang *h, ...@@ -147,12 +159,24 @@ static int emit_recurse_batch(struct hang *h,
*batch++ = 0; *batch++ = 0;
*batch++ = lower_32_bits(hws_address(hws, rq)); *batch++ = lower_32_bits(hws_address(hws, rq));
*batch++ = rq->fence.seqno; *batch++ = rq->fence.seqno;
*batch++ = MI_ARB_CHECK;
memset(batch, 0, 1024);
batch += 1024 / sizeof(*batch);
*batch++ = MI_ARB_CHECK;
*batch++ = MI_BATCH_BUFFER_START | 2 << 6; *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
*batch++ = lower_32_bits(vma->node.start); *batch++ = lower_32_bits(vma->node.start);
} else { } else {
*batch++ = MI_STORE_DWORD_IMM; *batch++ = MI_STORE_DWORD_IMM;
*batch++ = lower_32_bits(hws_address(hws, rq)); *batch++ = lower_32_bits(hws_address(hws, rq));
*batch++ = rq->fence.seqno; *batch++ = rq->fence.seqno;
*batch++ = MI_ARB_CHECK;
memset(batch, 0, 1024);
batch += 1024 / sizeof(*batch);
*batch++ = MI_ARB_CHECK;
*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1; *batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
*batch++ = lower_32_bits(vma->node.start); *batch++ = lower_32_bits(vma->node.start);
} }
...@@ -234,6 +258,16 @@ static void hang_fini(struct hang *h) ...@@ -234,6 +258,16 @@ static void hang_fini(struct hang *h)
i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED); i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
} }
/*
 * Wait for the hanging request @rq to show up in the status page of @h.
 *
 * Two waits are attempted on the same condition, namely that
 * hws_seqno(h, rq) has passed rq->fence.seqno: first wait_for_us() with an
 * argument of 10, then wait_for() with an argument of 1000. The second wait
 * is only attempted when the first yields a non-zero result.
 *
 * Returns true if either wait yields zero, false otherwise. Callers treat a
 * false return as "Failed to start request".
 *
 * NOTE(review): presumably the waits return 0 on success and non-zero on
 * timeout, with the timeouts in microseconds and milliseconds respectively;
 * confirm against the wait_for_us()/wait_for() definitions.
 */
static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
{
	/* Short wait first; a zero result means no further waiting is needed. */
	if (!wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					   rq->fence.seqno),
			 10))
		return true;

	/* Otherwise fall back to the longer wait and report its outcome. */
	return !wait_for(i915_seqno_passed(hws_seqno(h, rq),
					   rq->fence.seqno),
			 1000);
}
static int igt_hang_sanitycheck(void *arg) static int igt_hang_sanitycheck(void *arg)
{ {
struct drm_i915_private *i915 = arg; struct drm_i915_private *i915 = arg;
...@@ -296,6 +330,9 @@ static void global_reset_lock(struct drm_i915_private *i915) ...@@ -296,6 +330,9 @@ static void global_reset_lock(struct drm_i915_private *i915)
struct intel_engine_cs *engine; struct intel_engine_cs *engine;
enum intel_engine_id id; enum intel_engine_id id;
pr_debug("%s: current gpu_error=%08lx\n",
__func__, i915->gpu_error.flags);
while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
wait_event(i915->gpu_error.reset_queue, wait_event(i915->gpu_error.reset_queue,
!test_bit(I915_RESET_BACKOFF, !test_bit(I915_RESET_BACKOFF,
...@@ -353,54 +390,128 @@ static int igt_global_reset(void *arg) ...@@ -353,54 +390,128 @@ static int igt_global_reset(void *arg)
return err; return err;
} }
static int igt_reset_engine(void *arg) static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{ {
struct drm_i915_private *i915 = arg;
struct intel_engine_cs *engine; struct intel_engine_cs *engine;
enum intel_engine_id id; enum intel_engine_id id;
unsigned int reset_count, reset_engine_count; struct hang h;
int err = 0; int err = 0;
/* Check that we can issue a global GPU and engine reset */ /* Check that we can issue an engine reset on an idle engine (no-op) */
if (!intel_has_reset_engine(i915)) if (!intel_has_reset_engine(i915))
return 0; return 0;
if (active) {
mutex_lock(&i915->drm.struct_mutex);
err = hang_init(&h, i915);
mutex_unlock(&i915->drm.struct_mutex);
if (err)
return err;
}
for_each_engine(engine, i915, id) { for_each_engine(engine, i915, id) {
set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags); unsigned int reset_count, reset_engine_count;
IGT_TIMEOUT(end_time);
if (active && !intel_engine_can_store_dword(engine))
continue;
reset_count = i915_reset_count(&i915->gpu_error); reset_count = i915_reset_count(&i915->gpu_error);
reset_engine_count = i915_reset_engine_count(&i915->gpu_error, reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
engine); engine);
err = i915_reset_engine(engine, I915_RESET_QUIET); set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
if (err) { do {
pr_err("i915_reset_engine failed\n"); if (active) {
break; struct drm_i915_gem_request *rq;
}
mutex_lock(&i915->drm.struct_mutex);
rq = hang_create_request(&h, engine,
i915->kernel_context);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
mutex_unlock(&i915->drm.struct_mutex);
break;
}
i915_gem_request_get(rq);
__i915_add_request(rq, true);
mutex_unlock(&i915->drm.struct_mutex);
if (!wait_for_hang(&h, rq)) {
struct drm_printer p = drm_info_printer(i915->drm.dev);
pr_err("%s: Failed to start request %x, at %x\n",
__func__, rq->fence.seqno, hws_seqno(&h, rq));
intel_engine_dump(engine, &p,
"%s\n", engine->name);
i915_gem_request_put(rq);
err = -EIO;
break;
}
if (i915_reset_count(&i915->gpu_error) != reset_count) { i915_gem_request_put(rq);
pr_err("Full GPU reset recorded! (engine reset expected)\n"); }
err = -EINVAL;
break; engine->hangcheck.stalled = true;
} engine->hangcheck.seqno =
intel_engine_get_seqno(engine);
err = i915_reset_engine(engine, I915_RESET_QUIET);
if (err) {
pr_err("i915_reset_engine failed\n");
break;
}
if (i915_reset_count(&i915->gpu_error) != reset_count) {
pr_err("Full GPU reset recorded! (engine reset expected)\n");
err = -EINVAL;
break;
}
reset_engine_count += active;
if (i915_reset_engine_count(&i915->gpu_error, engine) !=
reset_engine_count) {
pr_err("%s engine reset %srecorded!\n",
engine->name, active ? "not " : "");
err = -EINVAL;
break;
}
engine->hangcheck.stalled = false;
} while (time_before(jiffies, end_time));
clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
if (i915_reset_engine_count(&i915->gpu_error, engine) == if (err)
reset_engine_count) {
pr_err("No %s engine reset recorded!\n", engine->name);
err = -EINVAL;
break; break;
}
clear_bit(I915_RESET_ENGINE + engine->id, cond_resched();
&i915->gpu_error.flags);
} }
if (i915_terminally_wedged(&i915->gpu_error)) if (i915_terminally_wedged(&i915->gpu_error))
err = -EIO; err = -EIO;
if (active) {
mutex_lock(&i915->drm.struct_mutex);
hang_fini(&h);
mutex_unlock(&i915->drm.struct_mutex);
}
return err; return err;
} }
/*
 * Subtest entry point: run __igt_reset_engine() with active == false, i.e.
 * without creating a hanging request before each engine reset.
 *
 * @arg is the struct drm_i915_private pointer for the device under test.
 * Returns whatever __igt_reset_engine() returns.
 */
static int igt_reset_idle_engine(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_engine(i915, false);
}
/*
 * Subtest entry point: run __igt_reset_engine() with active == true, so a
 * hanging request is submitted to the engine before each engine reset.
 *
 * @arg is the struct drm_i915_private pointer for the device under test.
 * Returns whatever __igt_reset_engine() returns.
 */
static int igt_reset_active_engine(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_engine(i915, true);
}
static int active_engine(void *data) static int active_engine(void *data)
{ {
struct intel_engine_cs *engine = data; struct intel_engine_cs *engine = data;
...@@ -462,11 +573,12 @@ static int active_engine(void *data) ...@@ -462,11 +573,12 @@ static int active_engine(void *data)
return err; return err;
} }
static int igt_reset_active_engines(void *arg) static int __igt_reset_engine_others(struct drm_i915_private *i915,
bool active)
{ {
struct drm_i915_private *i915 = arg; struct intel_engine_cs *engine, *other;
struct intel_engine_cs *engine, *active;
enum intel_engine_id id, tmp; enum intel_engine_id id, tmp;
struct hang h;
int err = 0; int err = 0;
/* Check that issuing a reset on one engine does not interfere /* Check that issuing a reset on one engine does not interfere
...@@ -476,24 +588,36 @@ static int igt_reset_active_engines(void *arg) ...@@ -476,24 +588,36 @@ static int igt_reset_active_engines(void *arg)
if (!intel_has_reset_engine(i915)) if (!intel_has_reset_engine(i915))
return 0; return 0;
if (active) {
mutex_lock(&i915->drm.struct_mutex);
err = hang_init(&h, i915);
mutex_unlock(&i915->drm.struct_mutex);
if (err)
return err;
}
for_each_engine(engine, i915, id) { for_each_engine(engine, i915, id) {
struct task_struct *threads[I915_NUM_ENGINES]; struct task_struct *threads[I915_NUM_ENGINES] = {};
unsigned long resets[I915_NUM_ENGINES]; unsigned long resets[I915_NUM_ENGINES];
unsigned long global = i915_reset_count(&i915->gpu_error); unsigned long global = i915_reset_count(&i915->gpu_error);
unsigned long count = 0;
IGT_TIMEOUT(end_time); IGT_TIMEOUT(end_time);
if (active && !intel_engine_can_store_dword(engine))
continue;
memset(threads, 0, sizeof(threads)); memset(threads, 0, sizeof(threads));
for_each_engine(active, i915, tmp) { for_each_engine(other, i915, tmp) {
struct task_struct *tsk; struct task_struct *tsk;
if (active == engine)
continue;
resets[tmp] = i915_reset_engine_count(&i915->gpu_error, resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
active); other);
tsk = kthread_run(active_engine, active, if (other == engine)
"igt/%s", active->name); continue;
tsk = kthread_run(active_engine, other,
"igt/%s", other->name);
if (IS_ERR(tsk)) { if (IS_ERR(tsk)) {
err = PTR_ERR(tsk); err = PTR_ERR(tsk);
goto unwind; goto unwind;
...@@ -503,20 +627,70 @@ static int igt_reset_active_engines(void *arg) ...@@ -503,20 +627,70 @@ static int igt_reset_active_engines(void *arg)
get_task_struct(tsk); get_task_struct(tsk);
} }
set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags); set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
do { do {
if (active) {
struct drm_i915_gem_request *rq;
mutex_lock(&i915->drm.struct_mutex);
rq = hang_create_request(&h, engine,
i915->kernel_context);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
mutex_unlock(&i915->drm.struct_mutex);
break;
}
i915_gem_request_get(rq);
__i915_add_request(rq, true);
mutex_unlock(&i915->drm.struct_mutex);
if (!wait_for_hang(&h, rq)) {
struct drm_printer p = drm_info_printer(i915->drm.dev);
pr_err("%s: Failed to start request %x, at %x\n",
__func__, rq->fence.seqno, hws_seqno(&h, rq));
intel_engine_dump(engine, &p,
"%s\n", engine->name);
i915_gem_request_put(rq);
err = -EIO;
break;
}
i915_gem_request_put(rq);
}
engine->hangcheck.stalled = true;
engine->hangcheck.seqno =
intel_engine_get_seqno(engine);
err = i915_reset_engine(engine, I915_RESET_QUIET); err = i915_reset_engine(engine, I915_RESET_QUIET);
if (err) { if (err) {
pr_err("i915_reset_engine(%s) failed, err=%d\n", pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
engine->name, err); engine->name, active ? "active" : "idle", err);
break; break;
} }
engine->hangcheck.stalled = false;
count++;
} while (time_before(jiffies, end_time)); } while (time_before(jiffies, end_time));
clear_bit(I915_RESET_ENGINE + engine->id, clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
&i915->gpu_error.flags); pr_info("i915_reset_engine(%s:%s): %lu resets\n",
engine->name, active ? "active" : "idle", count);
if (i915_reset_engine_count(&i915->gpu_error, engine) -
resets[engine->id] != (active ? count : 0)) {
pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
engine->name, active ? "active" : "idle", count,
i915_reset_engine_count(&i915->gpu_error,
engine) - resets[engine->id]);
if (!err)
err = -EINVAL;
}
unwind: unwind:
for_each_engine(active, i915, tmp) { for_each_engine(other, i915, tmp) {
int ret; int ret;
if (!threads[tmp]) if (!threads[tmp])
...@@ -524,27 +698,29 @@ static int igt_reset_active_engines(void *arg) ...@@ -524,27 +698,29 @@ static int igt_reset_active_engines(void *arg)
ret = kthread_stop(threads[tmp]); ret = kthread_stop(threads[tmp]);
if (ret) { if (ret) {
pr_err("kthread for active engine %s failed, err=%d\n", pr_err("kthread for other engine %s failed, err=%d\n",
active->name, ret); other->name, ret);
if (!err) if (!err)
err = ret; err = ret;
} }
put_task_struct(threads[tmp]); put_task_struct(threads[tmp]);
if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error, if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
active)) { other)) {
pr_err("Innocent engine %s was reset (count=%ld)\n", pr_err("Innocent engine %s was reset (count=%ld)\n",
active->name, other->name,
i915_reset_engine_count(&i915->gpu_error, i915_reset_engine_count(&i915->gpu_error,
active) - resets[tmp]); other) - resets[tmp]);
err = -EIO; if (!err)
err = -EINVAL;
} }
} }
if (global != i915_reset_count(&i915->gpu_error)) { if (global != i915_reset_count(&i915->gpu_error)) {
pr_err("Global reset (count=%ld)!\n", pr_err("Global reset (count=%ld)!\n",
i915_reset_count(&i915->gpu_error) - global); i915_reset_count(&i915->gpu_error) - global);
err = -EIO; if (!err)
err = -EINVAL;
} }
if (err) if (err)
...@@ -556,9 +732,25 @@ static int igt_reset_active_engines(void *arg) ...@@ -556,9 +732,25 @@ static int igt_reset_active_engines(void *arg)
if (i915_terminally_wedged(&i915->gpu_error)) if (i915_terminally_wedged(&i915->gpu_error))
err = -EIO; err = -EIO;
if (active) {
mutex_lock(&i915->drm.struct_mutex);
hang_fini(&h);
mutex_unlock(&i915->drm.struct_mutex);
}
return err; return err;
} }
/*
 * Subtest entry point: run __igt_reset_engine_others() with active == false,
 * i.e. the engine being reset is not loaded with a hanging request while the
 * other engines run their active_engine threads.
 *
 * @arg is the struct drm_i915_private pointer for the device under test.
 * Returns whatever __igt_reset_engine_others() returns.
 */
static int igt_reset_idle_engine_others(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_engine_others(i915, false);
}
/*
 * Subtest entry point: run __igt_reset_engine_others() with active == true,
 * so the engine being reset is loaded with a hanging request while the other
 * engines run their active_engine threads.
 *
 * @arg is the struct drm_i915_private pointer for the device under test.
 * Returns whatever __igt_reset_engine_others() returns.
 */
static int igt_reset_active_engine_others(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_engine_others(i915, true);
}
static u32 fake_hangcheck(struct drm_i915_gem_request *rq) static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
{ {
u32 reset_count; u32 reset_count;
...@@ -574,16 +766,6 @@ static u32 fake_hangcheck(struct drm_i915_gem_request *rq) ...@@ -574,16 +766,6 @@ static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
return reset_count; return reset_count;
} }
/*
 * Previous location of wait_for_hang(), shown on the removed side of this
 * hunk.
 *
 * Waits for hws_seqno(h, rq) to pass rq->fence.seqno, first via
 * wait_for_us() with an argument of 10 and then, only if that yields
 * non-zero, via wait_for() with an argument of 1000. Returns true when
 * either wait yields zero.
 *
 * NOTE(review): presumably non-zero from the waits means timeout; confirm
 * against the wait_for_us()/wait_for() definitions.
 */
static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
{
return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
rq->fence.seqno),
10) &&
wait_for(i915_seqno_passed(hws_seqno(h, rq),
rq->fence.seqno),
1000));
}
static int igt_wait_reset(void *arg) static int igt_wait_reset(void *arg)
{ {
struct drm_i915_private *i915 = arg; struct drm_i915_private *i915 = arg;
...@@ -617,8 +799,8 @@ static int igt_wait_reset(void *arg) ...@@ -617,8 +799,8 @@ static int igt_wait_reset(void *arg)
if (!wait_for_hang(&h, rq)) { if (!wait_for_hang(&h, rq)) {
struct drm_printer p = drm_info_printer(i915->drm.dev); struct drm_printer p = drm_info_printer(i915->drm.dev);
pr_err("Failed to start request %x, at %x\n", pr_err("%s: Failed to start request %x, at %x\n",
rq->fence.seqno, hws_seqno(&h, rq)); __func__, rq->fence.seqno, hws_seqno(&h, rq));
intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
i915_reset(i915, 0); i915_reset(i915, 0);
...@@ -712,8 +894,8 @@ static int igt_reset_queue(void *arg) ...@@ -712,8 +894,8 @@ static int igt_reset_queue(void *arg)
if (!wait_for_hang(&h, prev)) { if (!wait_for_hang(&h, prev)) {
struct drm_printer p = drm_info_printer(i915->drm.dev); struct drm_printer p = drm_info_printer(i915->drm.dev);
pr_err("Failed to start request %x, at %x\n", pr_err("%s: Failed to start request %x, at %x\n",
prev->fence.seqno, hws_seqno(&h, prev)); __func__, prev->fence.seqno, hws_seqno(&h, prev));
intel_engine_dump(prev->engine, &p, intel_engine_dump(prev->engine, &p,
"%s\n", prev->engine->name); "%s\n", prev->engine->name);
...@@ -819,8 +1001,8 @@ static int igt_handle_error(void *arg) ...@@ -819,8 +1001,8 @@ static int igt_handle_error(void *arg)
if (!wait_for_hang(&h, rq)) { if (!wait_for_hang(&h, rq)) {
struct drm_printer p = drm_info_printer(i915->drm.dev); struct drm_printer p = drm_info_printer(i915->drm.dev);
pr_err("Failed to start request %x, at %x\n", pr_err("%s: Failed to start request %x, at %x\n",
rq->fence.seqno, hws_seqno(&h, rq)); __func__, rq->fence.seqno, hws_seqno(&h, rq));
intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
i915_reset(i915, 0); i915_reset(i915, 0);
...@@ -864,21 +1046,26 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915) ...@@ -864,21 +1046,26 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
static const struct i915_subtest tests[] = { static const struct i915_subtest tests[] = {
SUBTEST(igt_global_reset), /* attempt to recover GPU first */ SUBTEST(igt_global_reset), /* attempt to recover GPU first */
SUBTEST(igt_hang_sanitycheck), SUBTEST(igt_hang_sanitycheck),
SUBTEST(igt_reset_engine), SUBTEST(igt_reset_idle_engine),
SUBTEST(igt_reset_active_engines), SUBTEST(igt_reset_active_engine),
SUBTEST(igt_reset_idle_engine_others),
SUBTEST(igt_reset_active_engine_others),
SUBTEST(igt_wait_reset), SUBTEST(igt_wait_reset),
SUBTEST(igt_reset_queue), SUBTEST(igt_reset_queue),
SUBTEST(igt_handle_error), SUBTEST(igt_handle_error),
}; };
bool saved_hangcheck;
int err; int err;
if (!intel_has_gpu_reset(i915)) if (!intel_has_gpu_reset(i915))
return 0; return 0;
intel_runtime_pm_get(i915); intel_runtime_pm_get(i915);
saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
err = i915_subtests(tests, i915); err = i915_subtests(tests, i915);
i915_modparams.enable_hangcheck = saved_hangcheck;
intel_runtime_pm_put(i915); intel_runtime_pm_put(i915);
return err; return err;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册