intel_hangcheck.c
/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"
#include "i915_random.h"

#include "mock_context.h"
#include "mock_drm.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

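/*
 * Fixture shared by the hang tests: a kernel context, an object holding
 * the spinning batch (obj/batch) and a page used as a makeshift hardware
 * status page (hws/seqno) into which the batch writes its breadcrumb.
 */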
struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->ctx = kernel_context(i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

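/* Each fence context gets its own breadcrumb dword within the HWS page. */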
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

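/*
 * Emit a batch that stores the request's seqno into the HWS page and then
 * branches back to its own start, spinning forever until the first dword
 * of the batch is overwritten with MI_BATCH_BUFFER_END.
 */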
static int emit_recurse_batch(struct hang *h,
			      struct i915_request *rq)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm = rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return err;

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	i915_vma_move_to_active(vma, rq, 0);
	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	i915_vma_move_to_active(hws, rq, 0);
	if (!i915_gem_object_has_active_reference(hws->obj)) {
		i915_gem_object_get(hws->obj);
		i915_gem_object_set_active_reference(hws->obj);
	}

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	i915_gem_chipset_flush(h->i915);

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err;
}

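/*
 * Submit a fresh spinning request on @engine. If the previous batch object
 * is still active, a new backing object is allocated first so that every
 * request spins on its own batch.
 */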
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	rq = i915_request_alloc(engine, h->ctx);
	if (IS_ERR(rq))
		return rq;

	err = emit_recurse_batch(h, rq);
	if (err) {
		__i915_request_add(rq, false);
		return ERR_PTR(err);
	}

	return rq;
}

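/* Read back the breadcrumb the spinning batch wrote for this request. */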
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

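/*
 * Safety net: if a wait inside a selftest takes longer than expected,
 * declare the GPU wedged so the test aborts instead of hanging the machine.
 */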
struct wedge_me {
	struct delayed_work work;
	struct drm_i915_private *i915;
	const void *symbol;
};

static void wedge_me(struct work_struct *work)
{
	struct wedge_me *w = container_of(work, typeof(*w), work.work);

	pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);

	GEM_TRACE("%pS timed out.\n", w->symbol);
	GEM_TRACE_DUMP();

	i915_gem_set_wedged(w->i915);
}

static void __init_wedge(struct wedge_me *w,
			 struct drm_i915_private *i915,
			 long timeout,
			 const void *symbol)
{
	w->i915 = i915;
	w->symbol = symbol;

	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
	schedule_delayed_work(&w->work, timeout);
}

static void __fini_wedge(struct wedge_me *w)
{
	cancel_delayed_work_sync(&w->work);
	destroy_delayed_work_on_stack(&w->work);
	w->i915 = NULL;
}

#define wedge_on_timeout(W, DEV, TIMEOUT)				\
	for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \
	     (W)->i915;							\
	     __fini_wedge((W)))

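/* Wait for the GPU to idle between subtests, wedging the GPU on timeout. */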
static noinline int
flush_test(struct drm_i915_private *i915, unsigned int flags)
{
	struct wedge_me w;

	cond_resched();

	wedge_on_timeout(&w, i915, HZ)
		i915_gem_wait_for_idle(i915, flags);

	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	flush_test(h->i915, I915_WAIT_LOCKED);
}

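/* Poll the HWS breadcrumb to confirm the spinning batch has started. */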
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		__i915_request_add(rq, true);

		timeout = i915_request_wait(rq,
					    I915_WAIT_LOCKED,
					    MAX_SCHEDULE_TIMEOUT);
		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

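/*
 * Grab the reset backoff bits (global and per engine) so that no other
 * path can start a reset while the test is manipulating the GPU.
 */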
static void global_reset_lock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	pr_debug("%s: current gpu_error=%08lx\n",
		 __func__, i915->gpu_error.flags);

	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));

	for_each_engine(engine, i915, id) {
		while (test_and_set_bit(I915_RESET_ENGINE + id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + id,
				    TASK_UNINTERRUPTIBLE);
	}
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	global_reset_lock(i915);
	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, ALL_ENGINES, NULL);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}
	mutex_unlock(&i915->drm.struct_mutex);

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

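/*
 * Exercise per-engine reset, either on an idle engine (active=false) or
 * while a spinning request is executing on it (active=true).
 */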
static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			u32 seqno = intel_engine_get_seqno(engine);

			if (active) {
				struct i915_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				__i915_request_add(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				GEM_BUG_ON(!rq->global_seqno);
				seqno = rq->global_seqno - 1;
				i915_request_put(rq);
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			reset_engine_count += active;
			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count) {
				pr_err("%s engine reset %srecorded!\n",
				       engine->name, active ? "not " : "");
				err = -EINVAL;
				break;
			}

			if (!wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("%s failed to idle after reset\n",
				       engine->name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

		if (err)
			break;

		err = flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

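/*
 * Background kthread used by __igt_reset_engines: keeps a small ring of
 * requests in flight on its engine (optionally with randomised priority)
 * while another engine is being reset, and wedges the GPU if one of its
 * requests fails to complete within a second.
 */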
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = i915_request_alloc(engine, ctx[idx]);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		if (old) {
			if (i915_request_wait(old, 0, HZ) < 0) {
				GEM_TRACE("%s timed out.\n", engine->name);
				GEM_TRACE_DUMP();

				i915_gem_set_wedged(engine->i915);
				i915_request_put(old);
				err = -EIO;
				break;
			}
			i915_request_put(old);
		}

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++)
		i915_request_put(rq[count]);

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct drm_i915_private *i915,
			       const char *test_name,
			       unsigned int flags)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(&i915->gpu_error,
							other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			u32 seqno = intel_engine_get_seqno(engine);
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				__i915_request_add(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				GEM_BUG_ON(!rq->global_seqno);
				seqno = rq->global_seqno - 1;
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(&i915->gpu_error, engine);
		reported -= threads[engine->id].resets;
		if (reported != (flags & TEST_ACTIVE ? count : 0)) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n",
			       engine->name, test_name, count, reported,
			       (flags & TEST_ACTIVE ? count : 0));
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other != engine &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(&i915->gpu_error, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct drm_i915_private *i915 = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

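/*
 * Pretend hangcheck has fired: mark the engines as stalled and set the
 * handoff bit so that a waiter performs the actual reset, mirroring what
 * the real hangcheck path would do.
 */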
static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
{
	struct i915_gpu_error *error = &rq->i915->gpu_error;
	u32 reset_count = i915_reset_count(error);

	error->stalled_mask = mask;

	/* set_bit() must be after we have setup the backchannel (mask) */
	smp_mb__before_atomic();
	set_bit(I915_RESET_HANDOFF, &error->flags);

	wake_up_all(&error->wait_queue);

	return reset_count;
}

static int igt_wait_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	__i915_request_add(rq, true);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(rq, ALL_ENGINES);

	timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

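/* Wait for every engine except @exclude to idle. */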
static int wait_for_others(struct drm_i915_private *i915,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		__i915_request_add(prev, true);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			__i915_request_add(rq, true);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(i915, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				i915_gem_set_wedged(i915);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s(%s): Failed to start request %x, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(prev, ENGINE_MASK(id));

			i915_reset(i915, ENGINE_MASK(id), NULL);

			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
					    &i915->gpu_error.flags));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_put(prev);

		err = flush_test(i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	__i915_request_add(rq, true);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	i915_handle_error(i915, ENGINE_MASK(engine->id), 0, NULL);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

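/*
 * Entry point for the live hangcheck selftests. Hangcheck itself is
 * disabled for the duration so that every reset is triggered explicitly
 * by the tests.
 */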
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_wait_reset),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_handle_error),
	};
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	intel_runtime_pm_get(i915);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);

	err = i915_subtests(tests, i915);

	mutex_lock(&i915->drm.struct_mutex);
	flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(i915);

	return err;
}