i915_gpu_error.c 45.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
/*
 * Copyright (c) 2008 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Keith Packard <keithp@keithp.com>
 *    Mika Kuoppala <mika.kuoppala@intel.com>
 *
 */

C
Chris Wilson 已提交
30 31
#include <linux/ascii85.h>
#include <linux/nmi.h>
32
#include <linux/pagevec.h>
C
Chris Wilson 已提交
33 34
#include <linux/scatterlist.h>
#include <linux/utsname.h>
35
#include <linux/zlib.h>
C
Chris Wilson 已提交
36

37 38
#include <drm/drm_print.h>

39
#include "display/intel_csr.h"
40 41
#include "display/intel_overlay.h"

42
#include "gem/i915_gem_context.h"
43
#include "gem/i915_gem_lmem.h"
44
#include "gt/intel_gt.h"
45
#include "gt/intel_gt_pm.h"
46

47
#include "i915_drv.h"
48
#include "i915_gpu_error.h"
49
#include "i915_memcpy.h"
50
#include "i915_scatterlist.h"
51

52 53 54
#define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
#define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN)

C
Chris Wilson 已提交
55 56
/*
 * Describe the kernel-virtual buffer @addr of @len bytes in @sg.
 *
 * The dma_address field is not used for DMA in this file: it stores @it,
 * which i915_gpu_coredump_copy_to_buffer() later reads back as the position
 * of this chunk within the dump.
 */
static void __sg_set_buf(struct scatterlist *sg,
			 void *addr, unsigned int len, loff_t it)
{
	struct page *page = virt_to_page(addr);

	sg->dma_address = it;
	sg->length = len;
	sg->offset = offset_in_page(addr);
	/* Plain assignment: also clears any chain/end marker bits. */
	sg->page_link = (unsigned long)page;
}

C
Chris Wilson 已提交
64
/*
 * Make room for @len more bytes (plus one spare byte) in the output buffer.
 *
 * If the current buffer cannot take the text, its contents are retired into
 * the next scatterlist entry and a new buffer is allocated; scatterlist
 * pages are allocated and chained on demand.
 *
 * The data buffer is allocated *before* a new scatterlist page is linked
 * in.  A freshly linked page comes from __get_free_page() uninitialised,
 * and err_print_to_sgl() marks "cur - 1" as the end of the list while
 * err_free_sgl() walks every entry.  Linking the page first and then
 * failing the buffer allocation would leave an empty page at the tail:
 * "cur - 1" would point before that page and the walk would hit garbage.
 *
 * Returns true when @len bytes may be written at e->buf + e->bytes.
 * Returns false when @len is zero, or on allocation failure, in which case
 * e->err is set to -ENOMEM.
 */
static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
{
	size_t size;
	void *buf;

	if (!len)
		return false;

	if (e->bytes + len + 1 <= e->size)
		return true;

	/* Retire the partially filled buffer into the next free sg entry. */
	if (e->bytes) {
		__sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
		e->iter += e->bytes;
		e->buf = NULL;
		e->bytes = 0;
	}

	/* Try for a generous chunk first, then settle for the bare minimum. */
	size = ALIGN(len + 1, SZ_64K);
	buf = kmalloc(size, ALLOW_FAIL);
	if (!buf) {
		size = PAGE_ALIGN(len + 1);
		buf = kmalloc(size, GFP_KERNEL);
	}
	if (!buf) {
		e->err = -ENOMEM;
		return false;
	}

	/* Out of sg entries (or none yet): add another page of them. */
	if (e->cur == e->end) {
		struct scatterlist *sgl;

		sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL);
		if (!sgl) {
			kfree(buf);
			e->err = -ENOMEM;
			return false;
		}

		if (e->cur) {
			/* The last slot of the old page becomes the chain link. */
			e->cur->offset = 0;
			e->cur->length = 0;
			e->cur->page_link =
				(unsigned long)sgl | SG_CHAIN;
		} else {
			e->sgl = sgl;
		}

		e->cur = sgl;
		e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
	}

	e->size = size;
	e->buf = buf;

	return true;
}

115
/*
 * Append printf-style formatted text to the error state buffer.
 *
 * Does nothing once e->err is set.  A non-positive formatted length is
 * stored in e->err and nothing is written.
 */
__printf(2, 0)
static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
			       const char *fmt, va_list args)
{
	va_list measure;
	int need, written;

	if (e->err)
		return;

	/* Dry run on a copy of the arguments to learn the output length. */
	va_copy(measure, args);
	need = vsnprintf(NULL, 0, fmt, measure);
	va_end(measure);
	if (need <= 0) {
		e->err = need;
		return;
	}

	if (!__i915_error_grow(e, need))
		return;

	GEM_BUG_ON(e->bytes >= e->size);
	written = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
	if (written < 0) {
		e->err = written;
		return;
	}

	e->bytes += written;
}

C
Chris Wilson 已提交
145
static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
146 147 148
{
	unsigned len;

C
Chris Wilson 已提交
149
	if (e->err || !str)
150 151 152
		return;

	len = strlen(str);
C
Chris Wilson 已提交
153 154
	if (!__i915_error_grow(e, len))
		return;
155

C
Chris Wilson 已提交
156
	GEM_BUG_ON(e->bytes + len > e->size);
157
	memcpy(e->buf + e->bytes, str, len);
C
Chris Wilson 已提交
158
	e->bytes += len;
159 160 161 162 163
}

#define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
#define err_puts(e, s) i915_error_puts(e, s)

164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
/* drm_printer callback: route the formatted text into the error state buffer. */
static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
{
	struct drm_i915_error_state_buf *e = p->arg;

	i915_error_vprintf(e, vaf->fmt, *vaf->va);
}

static inline struct drm_printer
i915_error_printer(struct drm_i915_error_state_buf *e)
{
	struct drm_printer p = {
		.printfn = __i915_printfn_error,
		.arg = e,
	};
	return p;
}

179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
/* single threaded page allocator with a reserved stash for emergencies */
/* Tear down the pool by releasing the pages still held in @pv. */
static void pool_fini(struct pagevec *pv)
{
	pagevec_release(pv);
}

/*
 * Top up @pv with freshly allocated pages until it has no space left.
 *
 * Returns 0 once the pagevec is full, or -ENOMEM as soon as one allocation
 * fails; pages added before the failure stay in @pv.
 */
static int pool_refill(struct pagevec *pv, gfp_t gfp)
{
	for (;;) {
		struct page *page;

		if (!pagevec_space(pv))
			return 0;

		page = alloc_page(gfp);
		if (!page)
			return -ENOMEM;

		pagevec_add(pv, page);
	}
}

/*
 * Initialise @pv and fill it with pages.
 *
 * Returns 0 on success.  On failure the partially filled pool is released
 * again and the error from pool_refill() is returned.
 */
static int pool_init(struct pagevec *pv, gfp_t gfp)
{
	int err;

	pagevec_init(pv);

	err = pool_refill(pv, gfp);
	if (!err)
		return 0;

	/* Partial fill: hand back whatever we did manage to get. */
	pool_fini(pv);
	return err;
}

/*
 * Get one page, preferring the page allocator and dipping into the stash
 * held in @pv only when that fails.
 *
 * Returns the page's kernel address, or NULL if neither source has a page.
 */
static void *pool_alloc(struct pagevec *pv, gfp_t gfp)
{
	struct page *page = alloc_page(gfp);

	if (!page && pagevec_count(pv)) {
		pv->nr--;
		page = pv->pages[pv->nr];
	}

	if (!page)
		return NULL;

	return page_address(page);
}

/*
 * Return the page at @addr: stash it in @pv when there is room, otherwise
 * free it.
 */
static void pool_free(struct pagevec *pv, void *addr)
{
	struct page *page = virt_to_page(addr);

	if (!pagevec_space(pv)) {
		__free_page(page);
		return;
	}

	pagevec_add(pv, page);
}

234 235
#ifdef CONFIG_DRM_I915_COMPRESS_ERROR

236
struct i915_vma_compress {
237
	struct pagevec pool;
238 239 240 241
	struct z_stream_s zstream;
	void *tmp;
};

242
static bool compress_init(struct i915_vma_compress *c)
243
{
244
	struct z_stream_s *zstream = &c->zstream;
245

246
	if (pool_init(&c->pool, ALLOW_FAIL))
247 248
		return false;

249 250 251 252 253
	zstream->workspace =
		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
			ALLOW_FAIL);
	if (!zstream->workspace) {
		pool_fini(&c->pool);
254 255 256
		return false;
	}

257
	c->tmp = NULL;
258
	if (i915_has_memcpy_from_wc())
259
		c->tmp = pool_alloc(&c->pool, ALLOW_FAIL);
260

261 262 263
	return true;
}

264
static bool compress_start(struct i915_vma_compress *c)
265
{
266 267 268 269 270 271 272 273 274
	struct z_stream_s *zstream = &c->zstream;
	void *workspace = zstream->workspace;

	memset(zstream, 0, sizeof(*zstream));
	zstream->workspace = workspace;

	return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK;
}

275 276
static void *compress_next_page(struct i915_vma_compress *c,
				struct i915_vma_coredump *dst)
277 278
{
	void *page;
279 280 281 282

	if (dst->page_count >= dst->num_pages)
		return ERR_PTR(-ENOSPC);

283
	page = pool_alloc(&c->pool, ALLOW_FAIL);
284 285 286
	if (!page)
		return ERR_PTR(-ENOMEM);

287
	return dst->pages[dst->page_count++] = page;
288 289
}

290
/*
 * Deflate one PAGE_SIZE page at @src into @dst.
 *
 * When @wc is set and a bounce page exists, the page is first copied with
 * i915_memcpy_from_wc(); if that call returns false the data is read
 * directly from @src.
 *
 * Returns 0 on success, -EIO if zlib reports an error, or the error from
 * compress_next_page() when no further output page is available.
 */
static int compress_page(struct i915_vma_compress *c,
			 void *src,
			 struct i915_vma_coredump *dst,
			 bool wc)
{
	struct z_stream_s *zstream = &c->zstream;
	void *input = src;

	if (wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
		input = c->tmp;

	zstream->next_in = input;
	zstream->avail_in = PAGE_SIZE;

	do {
		/* Output page exhausted (or none yet): attach the next one. */
		if (!zstream->avail_out) {
			zstream->next_out = compress_next_page(c, dst);
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);

			zstream->avail_out = PAGE_SIZE;
		}

		if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
			return -EIO;

		cond_resched();
	} while (zstream->avail_in);

	/* Fallback to uncompressed if we increase size? */
	if (0 && zstream->total_out > zstream->total_in)
		return -E2BIG;

	return 0;
}

324 325
static int compress_flush(struct i915_vma_compress *c,
			  struct i915_vma_coredump *dst)
326
{
327 328
	struct z_stream_s *zstream = &c->zstream;

329 330 331
	do {
		switch (zlib_deflate(zstream, Z_FINISH)) {
		case Z_OK: /* more space requested */
332
			zstream->next_out = compress_next_page(c, dst);
333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);

			zstream->avail_out = PAGE_SIZE;
			break;

		case Z_STREAM_END:
			goto end;

		default: /* any error */
			return -EIO;
		}
	} while (1);

end:
	memset(zstream->next_out, 0, zstream->avail_out);
	dst->unused = zstream->avail_out;
	return 0;
}

353
static void compress_finish(struct i915_vma_compress *c)
354
{
355 356
	zlib_deflateEnd(&c->zstream);
}
357

358
static void compress_fini(struct i915_vma_compress *c)
359 360
{
	kfree(c->zstream.workspace);
361
	if (c->tmp)
362 363
		pool_free(&c->pool, c->tmp);
	pool_fini(&c->pool);
364 365 366 367 368 369 370 371 372
}

/*
 * Emit the one-character marker that precedes the ASCII85 payload of an
 * object.  This build (CONFIG_DRM_I915_COMPRESS_ERROR) writes ":"; the
 * other build writes a different character.
 */
static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, ":");
}

#else

373
struct i915_vma_compress {
374
	struct pagevec pool;
375 376
};

377
static bool compress_init(struct i915_vma_compress *c)
378 379 380 381
{
	return pool_init(&c->pool, ALLOW_FAIL) == 0;
}

382
static bool compress_start(struct i915_vma_compress *c)
383 384 385 386
{
	return true;
}

387
/*
 * Uncompressed variant: copy one PAGE_SIZE page at @src into a newly
 * allocated page appended to @dst.
 *
 * With @wc set, i915_memcpy_from_wc() is tried first; a plain memcpy() is
 * used when @wc is clear or that call returns false.
 *
 * Returns 0 on success or -ENOMEM if no page could be allocated.
 */
static int compress_page(struct i915_vma_compress *c,
			 void *src,
			 struct i915_vma_coredump *dst,
			 bool wc)
{
	void *copy = pool_alloc(&c->pool, ALLOW_FAIL);

	if (!copy)
		return -ENOMEM;

	if (!wc || !i915_memcpy_from_wc(copy, src, PAGE_SIZE))
		memcpy(copy, src, PAGE_SIZE);

	dst->pages[dst->page_count] = copy;
	dst->page_count++;

	cond_resched();

	return 0;
}

406 407
/* Nothing is buffered without compression; always returns 0. */
static int compress_flush(struct i915_vma_compress *c,
			  struct i915_vma_coredump *dst)
{
	return 0;
}

412
/* No per-object state to tear down without compression. */
static void compress_finish(struct i915_vma_compress *c)
{
}

416
static void compress_fini(struct i915_vma_compress *c)
417 418 419 420
{
	pool_fini(&c->pool);
}

421 422 423 424 425 426 427
/*
 * Emit the one-character marker that precedes the ASCII85 payload of an
 * object.  This build (no CONFIG_DRM_I915_COMPRESS_ERROR) writes "~"; the
 * other build writes a different character.
 */
static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, "~");
}

#endif

428
static void error_print_instdone(struct drm_i915_error_state_buf *m,
429
				 const struct intel_engine_coredump *ee)
430
{
431
	const struct sseu_dev_info *sseu = &ee->engine->gt->info.sseu;
432 433 434
	int slice;
	int subslice;

435 436 437
	err_printf(m, "  INSTDONE: 0x%08x\n",
		   ee->instdone.instdone);

438
	if (ee->engine->class != RENDER_CLASS || INTEL_GEN(m->i915) <= 3)
439 440 441 442 443 444 445 446
		return;

	err_printf(m, "  SC_INSTDONE: 0x%08x\n",
		   ee->instdone.slice_common);

	if (INTEL_GEN(m->i915) <= 6)
		return;

447
	for_each_instdone_slice_subslice(m->i915, sseu, slice, subslice)
448 449 450 451
		err_printf(m, "  SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
			   slice, subslice,
			   ee->instdone.sampler[slice][subslice]);

452
	for_each_instdone_slice_subslice(m->i915, sseu, slice, subslice)
453 454 455
		err_printf(m, "  ROW_INSTDONE[%d][%d]: 0x%08x\n",
			   slice, subslice,
			   ee->instdone.row[slice][subslice]);
456 457 458 459 460 461 462 463

	if (INTEL_GEN(m->i915) < 12)
		return;

	err_printf(m, "  SC_INSTDONE_EXTRA: 0x%08x\n",
		   ee->instdone.slice_common_extra[0]);
	err_printf(m, "  SC_INSTDONE_EXTRA2: 0x%08x\n",
		   ee->instdone.slice_common_extra[1]);
464 465
}

466 467
static void error_print_request(struct drm_i915_error_state_buf *m,
				const char *prefix,
468
				const struct i915_request_coredump *erq)
469 470 471 472
{
	if (!erq->seqno)
		return;

473
	err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, head %08x, tail %08x\n",
474
		   prefix, erq->pid, erq->context, erq->seqno,
475 476 477 478 479
		   test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
			    &erq->flags) ? "!" : "",
		   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
			    &erq->flags) ? "+" : "",
		   erq->sched_attr.priority,
480
		   erq->head, erq->tail);
481 482
}

483 484
static void error_print_context(struct drm_i915_error_state_buf *m,
				const char *header,
485
				const struct i915_gem_context_coredump *ctx)
486
{
487
	const u32 period = m->i915->gt.clock_period_ns;
488 489

	err_printf(m, "%s%s[%d] prio %d, guilty %d active %d, runtime total %lluns, avg %lluns\n",
C
Chris Wilson 已提交
490
		   header, ctx->comm, ctx->pid, ctx->sched_attr.priority,
491 492 493
		   ctx->guilty, ctx->active,
		   ctx->total_runtime * period,
		   mul_u32_u32(ctx->avg_runtime, period));
494 495
}

496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513
/*
 * Walk the list starting at @vma and return the first entry whose name
 * equals @name, or NULL if there is none.
 */
static struct i915_vma_coredump *
__find_vma(struct i915_vma_coredump *vma, const char *name)
{
	for (; vma; vma = vma->next) {
		if (!strcmp(vma->name, name))
			return vma;
	}

	return NULL;
}

/* Return the captured object named "batch" for this engine, or NULL. */
static struct i915_vma_coredump *
find_batch(const struct intel_engine_coredump *ee)
{
	return __find_vma(ee->vma, "batch");
}

514
static void error_print_engine(struct drm_i915_error_state_buf *m,
515
			       const struct intel_engine_coredump *ee)
516
{
517
	struct i915_vma_coredump *batch;
518 519
	int n;

520
	err_printf(m, "%s command stream:\n", ee->engine->name);
521
	err_printf(m, "  CCID:  0x%08x\n", ee->ccid);
522
	err_printf(m, "  START: 0x%08x\n", ee->start);
523
	err_printf(m, "  HEAD:  0x%08x [0x%08x]\n", ee->head, ee->rq_head);
524 525
	err_printf(m, "  TAIL:  0x%08x [0x%08x, 0x%08x]\n",
		   ee->tail, ee->rq_post, ee->rq_tail);
526
	err_printf(m, "  CTL:   0x%08x\n", ee->ctl);
527
	err_printf(m, "  MODE:  0x%08x\n", ee->mode);
528 529 530 531 532
	err_printf(m, "  HWS:   0x%08x\n", ee->hws);
	err_printf(m, "  ACTHD: 0x%08x %08x\n",
		   (u32)(ee->acthd>>32), (u32)ee->acthd);
	err_printf(m, "  IPEIR: 0x%08x\n", ee->ipeir);
	err_printf(m, "  IPEHR: 0x%08x\n", ee->ipehr);
533
	err_printf(m, "  ESR:   0x%08x\n", ee->esr);
534 535 536

	error_print_instdone(m, ee);

537 538 539 540
	batch = find_batch(ee);
	if (batch) {
		u64 start = batch->gtt_offset;
		u64 end = start + batch->gtt_size;
541 542 543 544 545

		err_printf(m, "  batch: [0x%08x_%08x, 0x%08x_%08x]\n",
			   upper_32_bits(start), lower_32_bits(start),
			   upper_32_bits(end), lower_32_bits(end));
	}
546
	if (INTEL_GEN(m->i915) >= 4) {
547
		err_printf(m, "  BBADDR: 0x%08x_%08x\n",
548 549 550
			   (u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
		err_printf(m, "  BB_STATE: 0x%08x\n", ee->bbstate);
		err_printf(m, "  INSTPS: 0x%08x\n", ee->instps);
551
	}
552 553 554 555 556 557
	err_printf(m, "  INSTPM: 0x%08x\n", ee->instpm);
	err_printf(m, "  FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
		   lower_32_bits(ee->faddr));
	if (INTEL_GEN(m->i915) >= 6) {
		err_printf(m, "  RC PSMI: 0x%08x\n", ee->rc_psmi);
		err_printf(m, "  FAULT_REG: 0x%08x\n", ee->fault_reg);
558
	}
559
	if (HAS_PPGTT(m->i915)) {
560
		err_printf(m, "  GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);
561

562
		if (INTEL_GEN(m->i915) >= 8) {
563 564 565
			int i;
			for (i = 0; i < 4; i++)
				err_printf(m, "  PDP%d: 0x%016llx\n",
566
					   i, ee->vm_info.pdp[i]);
567 568
		} else {
			err_printf(m, "  PP_DIR_BASE: 0x%08x\n",
569
				   ee->vm_info.pp_dir_base);
570 571
		}
	}
572
	err_printf(m, "  hung: %u\n", ee->hung);
573
	err_printf(m, "  engine reset count: %u\n", ee->reset_count);
574

575 576
	for (n = 0; n < ee->num_ports; n++) {
		err_printf(m, "  ELSP[%d]:", n);
577
		error_print_request(m, " ", &ee->execlist[n]);
578 579
	}

580
	error_print_context(m, "  Active context: ", &ee->context);
581 582 583 584 585 586 587 588 589 590 591
}

/* printf-style front end to i915_error_vprintf(). */
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
	va_list ap;

	va_start(ap, f);
	i915_error_vprintf(e, f, ap);
	va_end(ap);
}

592
static void print_error_vma(struct drm_i915_error_state_buf *m,
593
			    const struct intel_engine_cs *engine,
594
			    const struct i915_vma_coredump *vma)
595
{
596
	char out[ASCII85_BUFSZ];
597
	int page;
598

599
	if (!vma)
600 601
		return;

602 603 604 605
	err_printf(m, "%s --- %s = 0x%08x %08x\n",
		   engine ? engine->name : "global", vma->name,
		   upper_32_bits(vma->gtt_offset),
		   lower_32_bits(vma->gtt_offset));
606

607 608
	if (vma->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K)
		err_printf(m, "gtt_page_sizes = 0x%08x\n", vma->gtt_page_sizes);
609

610
	err_compression_marker(m);
611
	for (page = 0; page < vma->page_count; page++) {
612 613 614
		int i, len;

		len = PAGE_SIZE;
615 616
		if (page == vma->page_count - 1)
			len -= vma->unused;
617 618
		len = ascii85_encode_len(len);

619
		for (i = 0; i < len; i++)
620
			err_puts(m, ascii85_encode(vma->pages[page][i], out));
621
	}
622
	err_puts(m, "\n");
623 624
}

625
static void err_print_capabilities(struct drm_i915_error_state_buf *m,
626
				   struct i915_gpu_coredump *error)
627
{
628 629
	struct drm_printer p = i915_error_printer(m);

630 631 632
	intel_device_info_print_static(&error->device_info, &p);
	intel_device_info_print_runtime(&error->runtime_info, &p);
	intel_driver_caps_print(&error->driver_caps, &p);
633 634
}

635
static void err_print_params(struct drm_i915_error_state_buf *m,
636
			     const struct i915_params *params)
637
{
638 639 640
	struct drm_printer p = i915_error_printer(m);

	i915_params_dump(params, &p);
641 642
}

643 644 645
static void err_print_pciid(struct drm_i915_error_state_buf *m,
			    struct drm_i915_private *i915)
{
646
	struct pci_dev *pdev = to_pci_dev(i915->drm.dev);
647 648 649 650 651 652 653 654

	err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
	err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
	err_printf(m, "PCI Subsystem: %04x:%04x\n",
		   pdev->subsystem_vendor,
		   pdev->subsystem_device);
}

655
static void err_print_uc(struct drm_i915_error_state_buf *m,
656
			 const struct intel_uc_coredump *error_uc)
657 658 659 660 661
{
	struct drm_printer p = i915_error_printer(m);

	intel_uc_fw_dump(&error_uc->guc_fw, &p);
	intel_uc_fw_dump(&error_uc->huc_fw, &p);
662
	print_error_vma(m, NULL, error_uc->guc_log);
663 664
}

C
Chris Wilson 已提交
665
static void err_free_sgl(struct scatterlist *sgl)
666
{
C
Chris Wilson 已提交
667 668
	while (sgl) {
		struct scatterlist *sg;
669

C
Chris Wilson 已提交
670 671 672 673 674 675 676 677 678
		for (sg = sgl; !sg_is_chain(sg); sg++) {
			kfree(sg_virt(sg));
			if (sg_is_last(sg))
				break;
		}

		sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg);
		free_page((unsigned long)sgl);
		sgl = sg;
679
	}
C
Chris Wilson 已提交
680
}
681

682 683 684 685 686 687 688 689 690
/* Print the captured GT info followed by its SSEU topology. */
static void err_print_gt_info(struct drm_i915_error_state_buf *m,
			      struct intel_gt_coredump *gt)
{
	struct drm_printer p = i915_error_printer(m);

	intel_gt_info_print(&gt->info, &p);
	intel_sseu_print_topology(&gt->info.sseu, &p);
}

691 692 693 694
/*
 * Print everything captured for one GT: the global interrupt/error
 * registers, fence registers, generation-specific registers, then every
 * engine with its captured objects, the uC state if present, and finally
 * the GT info.
 */
static void err_print_gt(struct drm_i915_error_state_buf *m,
			 struct intel_gt_coredump *gt)
{
	const struct intel_engine_coredump *ee;
	int i;

	err_printf(m, "GT awake: %s\n", yesno(gt->awake));
	err_printf(m, "EIR: 0x%08x\n", gt->eir);
	err_printf(m, "IER: 0x%08x\n", gt->ier);
	for (i = 0; i < gt->ngtier; i++)
		err_printf(m, "GTIER[%d]: 0x%08x\n", i, gt->gtier[i]);
	err_printf(m, "PGTBL_ER: 0x%08x\n", gt->pgtbl_er);
	err_printf(m, "FORCEWAKE: 0x%08x\n", gt->forcewake);
	err_printf(m, "DERRMR: 0x%08x\n", gt->derrmr);

	for (i = 0; i < gt->nfence; i++)
		err_printf(m, "  fence[%d] = %08llx\n", i, gt->fence[i]);

	if (IS_GEN_RANGE(m->i915, 6, 11)) {
		err_printf(m, "ERROR: 0x%08x\n", gt->error);
		err_printf(m, "DONE_REG: 0x%08x\n", gt->done_reg);
	}

	if (INTEL_GEN(m->i915) >= 8)
		err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
			   gt->fault_data1, gt->fault_data0);

	if (IS_GEN(m->i915, 7))
		err_printf(m, "ERR_INT: 0x%08x\n", gt->err_int);

	if (IS_GEN_RANGE(m->i915, 8, 11))
		err_printf(m, "GTT_CACHE_EN: 0x%08x\n", gt->gtt_cache);

	if (IS_GEN(m->i915, 12))
		err_printf(m, "AUX_ERR_DBG: 0x%08x\n", gt->aux_err);

	if (INTEL_GEN(m->i915) >= 12) {
		/* Reuses the function-scope index instead of shadowing it. */
		for (i = 0; i < GEN12_SFC_DONE_MAX; i++)
			err_printf(m, "  SFC_DONE[%d]: 0x%08x\n", i,
				   gt->sfc_done[i]);

		err_printf(m, "  GAM_DONE: 0x%08x\n", gt->gam_done);
	}

	for (ee = gt->engine; ee; ee = ee->next) {
		const struct i915_vma_coredump *vma;

		error_print_engine(m, ee);
		for (vma = ee->vma; vma; vma = vma->next)
			print_error_vma(m, ee->engine, vma);
	}

	if (gt->uc)
		err_print_uc(m, gt->uc);

	err_print_gt_info(m, gt);
}

C
Chris Wilson 已提交
751
static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
752
			       struct i915_gpu_coredump *error)
C
Chris Wilson 已提交
753
{
754
	const struct intel_engine_coredump *ee;
C
Chris Wilson 已提交
755
	struct timespec64 ts;
756

757 758
	if (*error->error_msg)
		err_printf(m, "%s\n", error->error_msg);
759 760 761
	err_printf(m, "Kernel: %s %s\n",
		   init_utsname()->release,
		   init_utsname()->machine);
762
	err_printf(m, "Driver: %s\n", DRIVER_DATE);
A
Arnd Bergmann 已提交
763 764 765 766 767 768 769 770 771
	ts = ktime_to_timespec64(error->time);
	err_printf(m, "Time: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	ts = ktime_to_timespec64(error->boottime);
	err_printf(m, "Boottime: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	ts = ktime_to_timespec64(error->uptime);
	err_printf(m, "Uptime: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
772 773
	err_printf(m, "Capture: %lu jiffies; %d ms ago\n",
		   error->capture, jiffies_to_msecs(jiffies - error->capture));
774

775
	for (ee = error->gt ? error->gt->engine : NULL; ee; ee = ee->next)
776
		err_printf(m, "Active process (on ring %s): %s [%d]\n",
777 778 779 780
			   ee->engine->name,
			   ee->context.comm,
			   ee->context.pid);

781
	err_printf(m, "Reset count: %u\n", error->reset_count);
782
	err_printf(m, "Suspend count: %u\n", error->suspend_count);
783
	err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
784 785 786
	err_printf(m, "Subplatform: 0x%x\n",
		   intel_subplatform(&error->runtime_info,
				     error->device_info.platform));
C
Chris Wilson 已提交
787
	err_print_pciid(m, m->i915);
788

789
	err_printf(m, "IOMMU enabled?: %d\n", error->iommu);
790

C
Chris Wilson 已提交
791
	if (HAS_CSR(m->i915)) {
792
		struct intel_dmc *dmc = &m->i915->dmc;
793 794

		err_printf(m, "DMC loaded: %s\n",
795
			   yesno(dmc->dmc_payload));
796
		err_printf(m, "DMC fw version: %d.%d\n",
797 798
			   CSR_VERSION_MAJOR(dmc->version),
			   CSR_VERSION_MINOR(dmc->version));
799 800
	}

801 802
	err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
	err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
803

804 805
	if (error->gt)
		err_print_gt(m, error->gt);
806 807 808 809

	if (error->overlay)
		intel_overlay_print_error_state(m, error->overlay);

810
	err_print_capabilities(m, error);
811
	err_print_params(m, &error->params);
C
Chris Wilson 已提交
812 813
}

814
static int err_print_to_sgl(struct i915_gpu_coredump *error)
C
Chris Wilson 已提交
815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843
{
	struct drm_i915_error_state_buf m;

	if (IS_ERR(error))
		return PTR_ERR(error);

	if (READ_ONCE(error->sgl))
		return 0;

	memset(&m, 0, sizeof(m));
	m.i915 = error->i915;

	__err_print_to_sgl(&m, error);

	if (m.buf) {
		__sg_set_buf(m.cur++, m.buf, m.bytes, m.iter);
		m.bytes = 0;
		m.buf = NULL;
	}
	if (m.cur) {
		GEM_BUG_ON(m.end < m.cur);
		sg_mark_end(m.cur - 1);
	}
	GEM_BUG_ON(m.sgl && !m.cur);

	if (m.err) {
		err_free_sgl(m.sgl);
		return m.err;
	}
844

C
Chris Wilson 已提交
845 846
	if (cmpxchg(&error->sgl, NULL, m.sgl))
		err_free_sgl(m.sgl);
847 848 849 850

	return 0;
}

851 852
/*
 * Copy up to @rem bytes of the rendered error state, starting at byte
 * offset @off, into @buf.
 *
 * The text is rendered on first use.  error->fit remembers the scatterlist
 * entry where the previous copy stopped, and is used as the starting point
 * when @off is not before that entry.
 *
 * Returns the number of bytes copied (0 if @error is NULL, @rem is 0 or
 * nothing was rendered), or a negative error code from rendering.
 */
ssize_t i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error,
					 char *buf, loff_t off, size_t rem)
{
	struct scatterlist *sg;
	size_t copied = 0;
	loff_t pos;
	int err;

	if (!error || !rem)
		return 0;

	err = err_print_to_sgl(error);
	if (err)
		return err;

	sg = READ_ONCE(error->fit);
	if (!sg || off < sg->dma_address)
		sg = error->sgl;
	if (!sg)
		return 0;

	/* dma_address holds the position of each chunk within the dump. */
	pos = sg->dma_address;
	do {
		size_t chunk, from;

		if (sg_is_chain(sg)) {
			sg = sg_chain_ptr(sg);
			GEM_BUG_ON(sg_is_chain(sg));
		}

		chunk = sg->length;
		if (pos + chunk <= off) {
			/* Entirely before the requested offset. */
			pos += chunk;
			continue;
		}

		from = sg->offset;
		if (pos < off) {
			/* The request starts part way into this chunk. */
			GEM_BUG_ON(off - pos > chunk);
			chunk -= off - pos;
			from += off - pos;
			pos = off;
		}

		chunk = min(chunk, rem);
		GEM_BUG_ON(!chunk || chunk > sg->length);

		memcpy(buf, page_address(sg_page(sg)) + from, chunk);

		copied += chunk;
		pos += chunk;

		buf += chunk;
		rem -= chunk;
		if (!rem) {
			WRITE_ONCE(error->fit, sg);
			break;
		}
	} while (!sg_is_last(sg++));

	return copied;
}

915
static void i915_vma_coredump_free(struct i915_vma_coredump *vma)
916
{
917 918 919
	while (vma) {
		struct i915_vma_coredump *next = vma->next;
		int page;
920

921 922
		for (page = 0; page < vma->page_count; page++)
			free_page((unsigned long)vma->pages[page]);
923

924 925 926
		kfree(vma);
		vma = next;
	}
927 928
}

929
static void cleanup_params(struct i915_gpu_coredump *error)
930
{
931
	i915_params_free(&error->params);
932 933
}

934
static void cleanup_uc(struct intel_uc_coredump *uc)
935
{
936 937 938
	kfree(uc->guc_fw.path);
	kfree(uc->huc_fw.path);
	i915_vma_coredump_free(uc->guc_log);
939

940
	kfree(uc);
941 942
}

943
static void cleanup_gt(struct intel_gt_coredump *gt)
944
{
945 946 947 948
	while (gt->engine) {
		struct intel_engine_coredump *ee = gt->engine;

		gt->engine = ee->next;
949

950 951 952
		i915_vma_coredump_free(ee->vma);
		kfree(ee);
	}
953

954 955
	if (gt->uc)
		cleanup_uc(gt->uc);
956

957 958
	kfree(gt);
}
959

960 961 962 963
void __i915_gpu_coredump_free(struct kref *error_ref)
{
	struct i915_gpu_coredump *error =
		container_of(error_ref, typeof(*error), ref);
964

965 966 967 968 969
	while (error->gt) {
		struct intel_gt_coredump *gt = error->gt;

		error->gt = gt->next;
		cleanup_gt(gt);
970 971 972
	}

	kfree(error->overlay);
973

974
	cleanup_params(error);
975

C
Chris Wilson 已提交
976
	err_free_sgl(error->sgl);
977 978 979
	kfree(error);
}

980 981 982 983 984
static struct i915_vma_coredump *
i915_vma_coredump_create(const struct intel_gt *gt,
			 const struct i915_vma *vma,
			 const char *name,
			 struct i915_vma_compress *compress)
985
{
986
	struct i915_ggtt *ggtt = gt->ggtt;
987
	const u64 slot = ggtt->error_capture.start;
988
	struct i915_vma_coredump *dst;
989 990
	unsigned long num_pages;
	struct sgt_iter iter;
991
	int ret;
992

993 994
	might_sleep();

995
	if (!vma || !vma->pages || !compress)
C
Chris Wilson 已提交
996 997
		return NULL;

998
	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
999
	num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
1000
	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), ALLOW_FAIL);
C
Chris Wilson 已提交
1001
	if (!dst)
1002 1003
		return NULL;

1004 1005 1006 1007 1008
	if (!compress_start(compress)) {
		kfree(dst);
		return NULL;
	}

1009 1010 1011
	strcpy(dst->name, name);
	dst->next = NULL;

1012 1013
	dst->gtt_offset = vma->node.start;
	dst->gtt_size = vma->node.size;
1014
	dst->gtt_page_sizes = vma->page_sizes.gtt;
1015
	dst->num_pages = num_pages;
1016
	dst->page_count = 0;
1017 1018
	dst->unused = 0;

1019
	ret = -EINVAL;
1020
	if (drm_mm_node_allocated(&ggtt->error_capture)) {
1021
		void __iomem *s;
1022
		dma_addr_t dma;
1023

1024
		for_each_sgt_daddr(dma, iter, vma->pages) {
1025
			mutex_lock(&ggtt->error_mutex);
1026 1027
			ggtt->vm.insert_page(&ggtt->vm, dma, slot,
					     I915_CACHE_NONE, 0);
1028
			mb();
1029

1030
			s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE);
1031 1032 1033
			ret = compress_page(compress,
					    (void  __force *)s, dst,
					    true);
1034
			io_mapping_unmap(s);
1035 1036 1037 1038

			mb();
			ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
			mutex_unlock(&ggtt->error_mutex);
1039 1040 1041 1042 1043 1044 1045 1046 1047 1048
			if (ret)
				break;
		}
	} else if (i915_gem_object_is_lmem(vma->obj)) {
		struct intel_memory_region *mem = vma->obj->mm.region;
		dma_addr_t dma;

		for_each_sgt_daddr(dma, iter, vma->pages) {
			void __iomem *s;

1049 1050 1051
			s = io_mapping_map_wc(&mem->iomap,
					      dma - mem->region.start,
					      PAGE_SIZE);
1052 1053 1054
			ret = compress_page(compress,
					    (void __force *)s, dst,
					    true);
1055
			io_mapping_unmap(s);
1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066
			if (ret)
				break;
		}
	} else {
		struct page *page;

		for_each_sgt_page(page, iter, vma->pages) {
			void *s;

			drm_clflush_pages(&page, 1);

1067
			s = kmap(page);
1068
			ret = compress_page(compress, s, dst, false);
1069
			kunmap(page);
1070 1071 1072 1073 1074 1075

			drm_clflush_pages(&page, 1);

			if (ret)
				break;
		}
1076 1077
	}

1078
	if (ret || compress_flush(compress, dst)) {
1079
		while (dst->page_count--)
1080
			pool_free(&compress->pool, dst->pages[dst->page_count]);
1081 1082 1083
		kfree(dst);
		dst = NULL;
	}
1084
	compress_finish(compress);
1085 1086

	return dst;
1087 1088
}

1089
static void gt_record_fences(struct intel_gt_coredump *gt)
1090
{
1091 1092
	struct i915_ggtt *ggtt = gt->_gt->ggtt;
	struct intel_uncore *uncore = gt->_gt->uncore;
1093 1094
	int i;

1095 1096 1097
	if (INTEL_GEN(uncore->i915) >= 6) {
		for (i = 0; i < ggtt->num_fences; i++)
			gt->fence[i] =
1098 1099
				intel_uncore_read64(uncore,
						    FENCE_REG_GEN6_LO(i));
1100 1101 1102
	} else if (INTEL_GEN(uncore->i915) >= 4) {
		for (i = 0; i < ggtt->num_fences; i++)
			gt->fence[i] =
1103 1104
				intel_uncore_read64(uncore,
						    FENCE_REG_965_LO(i));
1105
	} else {
1106 1107
		for (i = 0; i < ggtt->num_fences; i++)
			gt->fence[i] =
1108
				intel_uncore_read(uncore, FENCE_REG(i));
1109
	}
1110
	gt->nfence = i;
1111 1112
}

1113
static void engine_record_registers(struct intel_engine_coredump *ee)
1114
{
1115 1116
	const struct intel_engine_cs *engine = ee->engine;
	struct drm_i915_private *i915 = engine->i915;
1117

1118
	if (INTEL_GEN(i915) >= 6) {
1119
		ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL);
1120

1121 1122 1123 1124 1125 1126
		if (INTEL_GEN(i915) >= 12)
			ee->fault_reg = intel_uncore_read(engine->uncore,
							  GEN12_RING_FAULT_REG);
		else if (INTEL_GEN(i915) >= 8)
			ee->fault_reg = intel_uncore_read(engine->uncore,
							  GEN8_RING_FAULT_REG);
1127
		else
1128
			ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine);
1129 1130
	}

1131
	if (INTEL_GEN(i915) >= 4) {
1132
		ee->esr = ENGINE_READ(engine, RING_ESR);
1133 1134 1135 1136 1137
		ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
		ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
		ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
		ee->instps = ENGINE_READ(engine, RING_INSTPS);
		ee->bbaddr = ENGINE_READ(engine, RING_BBADDR);
1138 1139
		ee->ccid = ENGINE_READ(engine, CCID);
		if (INTEL_GEN(i915) >= 8) {
1140 1141
			ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32;
			ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32;
1142
		}
1143
		ee->bbstate = ENGINE_READ(engine, RING_BBSTATE);
1144
	} else {
1145 1146 1147
		ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX);
		ee->ipeir = ENGINE_READ(engine, IPEIR);
		ee->ipehr = ENGINE_READ(engine, IPEHR);
1148 1149
	}

1150
	intel_engine_get_instdone(engine, &ee->instdone);
1151

1152
	ee->instpm = ENGINE_READ(engine, RING_INSTPM);
1153
	ee->acthd = intel_engine_get_active_head(engine);
1154 1155 1156 1157
	ee->start = ENGINE_READ(engine, RING_START);
	ee->head = ENGINE_READ(engine, RING_HEAD);
	ee->tail = ENGINE_READ(engine, RING_TAIL);
	ee->ctl = ENGINE_READ(engine, RING_CTL);
1158
	if (INTEL_GEN(i915) > 2)
1159
		ee->mode = ENGINE_READ(engine, RING_MI_MODE);
1160

1161
	if (!HWS_NEEDS_PHYSICAL(i915)) {
1162
		i915_reg_t mmio;
1163

1164
		if (IS_GEN(i915, 7)) {
1165
			switch (engine->id) {
1166
			default:
1167
				MISSING_CASE(engine->id);
1168
				fallthrough;
1169
			case RCS0:
1170 1171
				mmio = RENDER_HWS_PGA_GEN7;
				break;
1172
			case BCS0:
1173 1174
				mmio = BLT_HWS_PGA_GEN7;
				break;
1175
			case VCS0:
1176 1177
				mmio = BSD_HWS_PGA_GEN7;
				break;
1178
			case VECS0:
1179 1180 1181
				mmio = VEBOX_HWS_PGA_GEN7;
				break;
			}
1182
		} else if (IS_GEN(engine->i915, 6)) {
1183
			mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
1184 1185
		} else {
			/* XXX: gen8 returns to sanity */
1186
			mmio = RING_HWS_PGA(engine->mmio_base);
1187 1188
		}

1189
		ee->hws = intel_uncore_read(engine->uncore, mmio);
1190 1191
	}

1192
	ee->reset_count = i915_reset_engine_count(&i915->gpu_error, engine);
1193

1194
	if (HAS_PPGTT(i915)) {
1195 1196
		int i;

1197
		ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7);
1198

1199
		if (IS_GEN(i915, 6)) {
1200
			ee->vm_info.pp_dir_base =
1201
				ENGINE_READ(engine, RING_PP_DIR_BASE_READ);
1202
		} else if (IS_GEN(i915, 7)) {
1203
			ee->vm_info.pp_dir_base =
1204
				ENGINE_READ(engine, RING_PP_DIR_BASE);
1205
		} else if (INTEL_GEN(i915) >= 8) {
1206 1207
			u32 base = engine->mmio_base;

1208
			for (i = 0; i < 4; i++) {
1209
				ee->vm_info.pdp[i] =
1210 1211
					intel_uncore_read(engine->uncore,
							  GEN8_RING_PDP_UDW(base, i));
1212 1213
				ee->vm_info.pdp[i] <<= 32;
				ee->vm_info.pdp[i] |=
1214 1215
					intel_uncore_read(engine->uncore,
							  GEN8_RING_PDP_LDW(base, i));
1216
			}
1217
		}
1218
	}
1219 1220
}

1221
static void record_request(const struct i915_request *request,
1222
			   struct i915_request_coredump *erq)
1223
{
1224
	erq->flags = request->fence.flags;
1225 1226
	erq->context = request->fence.context;
	erq->seqno = request->fence.seqno;
1227
	erq->sched_attr = request->sched.attr;
1228 1229
	erq->head = request->head;
	erq->tail = request->tail;
1230 1231 1232

	erq->pid = 0;
	rcu_read_lock();
1233 1234 1235 1236 1237 1238 1239
	if (!intel_context_is_closed(request->context)) {
		const struct i915_gem_context *ctx;

		ctx = rcu_dereference(request->context->gem_context);
		if (ctx)
			erq->pid = pid_nr(ctx->pid);
	}
1240
	rcu_read_unlock();
1241 1242
}

1243
static void engine_record_execlists(struct intel_engine_coredump *ee)
1244
{
1245 1246
	const struct intel_engine_execlists * const el = &ee->engine->execlists;
	struct i915_request * const *port = el->active;
1247
	unsigned int n = 0;
1248

1249 1250
	while (*port)
		record_request(*port++, &ee->execlist[n++]);
1251 1252

	ee->num_ports = n;
1253 1254
}

1255
static bool record_context(struct i915_gem_context_coredump *e,
1256
			   const struct i915_request *rq)
1257
{
1258 1259
	struct i915_gem_context *ctx;
	struct task_struct *task;
1260
	bool simulated;
1261 1262 1263 1264 1265 1266

	rcu_read_lock();
	ctx = rcu_dereference(rq->context->gem_context);
	if (ctx && !kref_get_unless_zero(&ctx->ref))
		ctx = NULL;
	rcu_read_unlock();
1267
	if (!ctx)
1268
		return true;
1269

1270 1271 1272 1273 1274
	rcu_read_lock();
	task = pid_task(ctx->pid, PIDTYPE_PID);
	if (task) {
		strcpy(e->comm, task->comm);
		e->pid = task->pid;
1275
	}
1276
	rcu_read_unlock();
1277

1278
	e->sched_attr = ctx->sched;
1279 1280
	e->guilty = atomic_read(&ctx->guilty_count);
	e->active = atomic_read(&ctx->active_count);
1281

1282 1283 1284
	e->total_runtime = rq->context->runtime.total;
	e->avg_runtime = ewma_runtime_read(&rq->context->runtime.avg);

1285
	simulated = i915_gem_context_no_error_capture(ctx);
1286 1287

	i915_gem_context_put(ctx);
1288
	return simulated;
1289 1290
}

1291 1292 1293 1294
/* One node in a singly linked list of vma queued for capture. */
struct intel_engine_capture_vma {
	struct intel_engine_capture_vma *next;	/* next node, NULL at the end */
	struct i915_vma *vma;			/* vma to be captured */
	char name[16];				/* label, e.g. "batch", "ring" */
};

1297 1298
/*
 * Queue @vma for capture by pushing a new node onto the front of @next.
 *
 * @next: current head of the capture list (may be NULL)
 * @vma: vma to capture; NULL is accepted and ignored
 * @name: label stored in the node; truncated if it does not fit
 * @gfp: allocation flags for the node
 *
 * Returns the new list head, or @next unchanged if @vma is NULL, the node
 * allocation fails, or i915_active_acquire_if_busy() does not succeed.
 */
static struct intel_engine_capture_vma *
capture_vma(struct intel_engine_capture_vma *next,
	    struct i915_vma *vma,
	    const char *name,
	    gfp_t gfp)
{
	struct intel_engine_capture_vma *c;

	if (!vma)
		return next;

	c = kmalloc(sizeof(*c), gfp);
	if (!c)
		return next;

	if (!i915_active_acquire_if_busy(&vma->active)) {
		kfree(c);
		return next;
	}

	/*
	 * c->name is a fixed-size array: use a bounded, always
	 * NUL-terminated copy so an over-long label is truncated instead
	 * of overrunning the node.
	 */
	strscpy(c->name, name, sizeof(c->name));
	c->vma = vma; /* reference held while active */

	c->next = next;
	return c;
}

1324 1325 1326 1327
/*
 * Queue every vma on @rq's capture_list for capture under the label
 * "user", and return the updated list head.
 */
static struct intel_engine_capture_vma *
capture_user(struct intel_engine_capture_vma *capture,
	     const struct i915_request *rq,
	     gfp_t gfp)
{
	struct i915_capture_list *entry = rq->capture_list;

	while (entry) {
		capture = capture_vma(capture, entry->vma, "user", gfp);
		entry = entry->next;
	}

	return capture;
}

1337 1338
static void add_vma(struct intel_engine_coredump *ee,
		    struct i915_vma_coredump *vma)
1339
{
1340 1341 1342 1343 1344 1345 1346 1347 1348 1349
	if (vma) {
		vma->next = ee->vma;
		ee->vma = vma;
	}
}

/*
 * Allocate a zeroed engine coredump for @engine and fill in its register
 * and execlists state. Returns NULL if the allocation fails.
 */
struct intel_engine_coredump *
intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp)
{
	struct intel_engine_coredump *ee = kzalloc(sizeof(*ee), gfp);

	if (ee) {
		ee->engine = engine;

		engine_record_registers(ee);
		engine_record_execlists(ee);
	}

	return ee;
}
1362

1363 1364 1365 1366 1367 1368
/*
 * Record the context of @rq in @ee and build the list of vma to capture
 * for it (batch, user capture list, ring, HW context).
 *
 * Returns the head of that list, or NULL if ee->simulated is set after
 * recording the context or if nothing could be queued.
 */
struct intel_engine_capture_vma *
intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
				  struct i915_request *rq,
				  gfp_t gfp)
{
	struct intel_engine_capture_vma *head;

	ee->simulated |= record_context(&ee->context, rq);
	if (ee->simulated)
		return NULL;

	/*
	 * We need to copy these to an anonymous buffer
	 * as the simplest method to avoid being overwritten
	 * by userspace.
	 */
	head = capture_vma(NULL, rq->batch, "batch", gfp);
	head = capture_user(head, rq, gfp);
	head = capture_vma(head, rq->ring->vma, "ring", gfp);
	head = capture_vma(head, rq->context->state, "HW context", gfp);

	ee->rq_head = rq->head;
	ee->rq_post = rq->postfix;
	ee->rq_tail = rq->tail;

	return head;
}
1390

1391 1392 1393 1394 1395 1396
void
intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
			      struct intel_engine_capture_vma *capture,
			      struct i915_vma_compress *compress)
{
	const struct intel_engine_cs *engine = ee->engine;
1397

1398 1399 1400
	while (capture) {
		struct intel_engine_capture_vma *this = capture;
		struct i915_vma *vma = this->vma;
1401

1402 1403 1404 1405
		add_vma(ee,
			i915_vma_coredump_create(engine->gt,
						 vma, this->name,
						 compress));
1406

1407
		i915_active_release(&vma->active);
1408

1409 1410 1411
		capture = this->next;
		kfree(this);
	}
1412

1413 1414 1415 1416 1417
	add_vma(ee,
		i915_vma_coredump_create(engine->gt,
					 engine->status_page.vma,
					 "HW Status",
					 compress));
1418

1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429
	add_vma(ee,
		i915_vma_coredump_create(engine->gt,
					 engine->wa_ctx.vma,
					 "WA context",
					 compress));
}

static struct intel_engine_coredump *
capture_engine(struct intel_engine_cs *engine,
	       struct i915_vma_compress *compress)
{
1430
	struct intel_engine_capture_vma *capture = NULL;
1431 1432 1433
	struct intel_engine_coredump *ee;
	struct i915_request *rq;
	unsigned long flags;
1434

1435 1436 1437
	ee = intel_engine_coredump_alloc(engine, GFP_KERNEL);
	if (!ee)
		return NULL;
1438

1439 1440
	spin_lock_irqsave(&engine->active.lock, flags);
	rq = intel_engine_find_active_request(engine);
1441 1442 1443 1444 1445
	if (rq)
		capture = intel_engine_coredump_add_request(ee, rq,
							    ATOMIC_MAYFAIL);
	spin_unlock_irqrestore(&engine->active.lock, flags);
	if (!capture) {
1446 1447 1448
		kfree(ee);
		return NULL;
	}
1449

1450
	intel_engine_coredump_add_vma(ee, capture, compress);
1451

1452
	return ee;
1453 1454
}

1455
static void
1456
gt_record_engines(struct intel_gt_coredump *gt,
1457
		  intel_engine_mask_t engine_mask,
1458
		  struct i915_vma_compress *compress)
1459
{
1460 1461
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
1462

1463 1464 1465 1466 1467 1468 1469 1470 1471 1472
	for_each_engine(engine, gt->_gt, id) {
		struct intel_engine_coredump *ee;

		/* Refill our page pool before entering atomic section */
		pool_refill(&compress->pool, ALLOW_FAIL);

		ee = capture_engine(engine, compress);
		if (!ee)
			continue;

1473 1474
		ee->hung = engine->mask & engine_mask;

1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495
		gt->simulated |= ee->simulated;
		if (ee->simulated) {
			kfree(ee);
			continue;
		}

		ee->next = gt->engine;
		gt->engine = ee;
	}
}

/*
 * Snapshot the uC state of the gt: copies of the GuC and HuC firmware
 * descriptors (with privately duplicated path strings) and a capture of
 * the GuC log vma. Returns NULL if the snapshot cannot be allocated.
 */
static struct intel_uc_coredump *
gt_record_uc(struct intel_gt_coredump *gt,
	     struct i915_vma_compress *compress)
{
	const struct intel_uc *uc = &gt->_gt->uc;
	struct intel_uc_coredump *snap;

	snap = kzalloc(sizeof(*snap), ALLOW_FAIL);
	if (!snap)
		return NULL;

	memcpy(&snap->guc_fw, &uc->guc.fw, sizeof(uc->guc.fw));
	memcpy(&snap->huc_fw, &uc->huc.fw, sizeof(uc->huc.fw));

	/*
	 * Non-default firmware paths will be specified by the modparam.
	 * As modparams are generally accessible from userspace, make
	 * explicit copies of the firmware paths.
	 */
	snap->guc_fw.path = kstrdup(uc->guc.fw.path, ALLOW_FAIL);
	snap->huc_fw.path = kstrdup(uc->huc.fw.path, ALLOW_FAIL);

	snap->guc_log = i915_vma_coredump_create(gt->_gt,
						 uc->guc.log.vma,
						 "GuC log buffer",
						 compress);

	return snap;
}

1514
/* Capture all registers which don't fit into another category. */
1515
static void gt_record_regs(struct intel_gt_coredump *gt)
1516
{
1517 1518
	struct intel_uncore *uncore = gt->_gt->uncore;
	struct drm_i915_private *i915 = uncore->i915;
1519
	int i;
1520

1521 1522
	/*
	 * General organization
1523 1524 1525 1526 1527 1528
	 * 1. Registers specific to a single generation
	 * 2. Registers which belong to multiple generations
	 * 3. Feature specific registers.
	 * 4. Everything else
	 * Please try to follow the order.
	 */
1529

1530
	/* 1: Registers specific to a single generation */
1531
	if (IS_VALLEYVIEW(i915)) {
1532 1533 1534
		gt->gtier[0] = intel_uncore_read(uncore, GTIER);
		gt->ier = intel_uncore_read(uncore, VLV_IER);
		gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);
1535
	}
1536

1537
	if (IS_GEN(i915, 7))
1538
		gt->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);
1539

1540
	if (INTEL_GEN(i915) >= 12) {
1541 1542 1543 1544
		gt->fault_data0 = intel_uncore_read(uncore,
						    GEN12_FAULT_TLB_DATA0);
		gt->fault_data1 = intel_uncore_read(uncore,
						    GEN12_FAULT_TLB_DATA1);
1545
	} else if (INTEL_GEN(i915) >= 8) {
1546 1547 1548 1549
		gt->fault_data0 = intel_uncore_read(uncore,
						    GEN8_FAULT_TLB_DATA0);
		gt->fault_data1 = intel_uncore_read(uncore,
						    GEN8_FAULT_TLB_DATA1);
1550 1551
	}

1552
	if (IS_GEN(i915, 6)) {
1553 1554 1555
		gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE);
		gt->gab_ctl = intel_uncore_read(uncore, GAB_CTL);
		gt->gfx_mode = intel_uncore_read(uncore, GFX_MODE);
1556
	}
1557

1558
	/* 2: Registers which belong to multiple generations */
1559
	if (INTEL_GEN(i915) >= 7)
1560
		gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);
1561

1562
	if (INTEL_GEN(i915) >= 6) {
1563
		gt->derrmr = intel_uncore_read(uncore, DERRMR);
1564
		if (INTEL_GEN(i915) < 12) {
1565 1566
			gt->error = intel_uncore_read(uncore, ERROR_GEN6);
			gt->done_reg = intel_uncore_read(uncore, DONE_REG);
1567
		}
1568 1569
	}

1570
	/* 3: Feature specific registers */
1571
	if (IS_GEN_RANGE(i915, 6, 7)) {
1572 1573
		gt->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK);
		gt->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);
1574 1575
	}

1576
	if (IS_GEN_RANGE(i915, 8, 11))
1577
		gt->gtt_cache = intel_uncore_read(uncore, HSW_GTT_CACHE_EN);
1578

1579
	if (IS_GEN(i915, 12))
1580
		gt->aux_err = intel_uncore_read(uncore, GEN12_AUX_ERR_DBG);
1581

1582 1583
	if (INTEL_GEN(i915) >= 12) {
		for (i = 0; i < GEN12_SFC_DONE_MAX; i++) {
1584
			gt->sfc_done[i] =
1585 1586
				intel_uncore_read(uncore, GEN12_SFC_DONE(i));
		}
M
Mika Kuoppala 已提交
1587

1588
		gt->gam_done = intel_uncore_read(uncore, GEN12_GAM_DONE);
1589 1590
	}

1591
	/* 4: Everything else */
1592
	if (INTEL_GEN(i915) >= 11) {
1593 1594
		gt->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
		gt->gtier[0] =
1595 1596
			intel_uncore_read(uncore,
					  GEN11_RENDER_COPY_INTR_ENABLE);
1597
		gt->gtier[1] =
1598
			intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE);
1599
		gt->gtier[2] =
1600
			intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE);
1601
		gt->gtier[3] =
1602 1603
			intel_uncore_read(uncore,
					  GEN11_GPM_WGBOXPERF_INTR_ENABLE);
1604
		gt->gtier[4] =
1605 1606
			intel_uncore_read(uncore,
					  GEN11_CRYPTO_RSVD_INTR_ENABLE);
1607
		gt->gtier[5] =
1608 1609
			intel_uncore_read(uncore,
					  GEN11_GUNIT_CSME_INTR_ENABLE);
1610
		gt->ngtier = 6;
1611
	} else if (INTEL_GEN(i915) >= 8) {
1612
		gt->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
1613
		for (i = 0; i < 4; i++)
1614 1615 1616
			gt->gtier[i] =
				intel_uncore_read(uncore, GEN8_GT_IER(i));
		gt->ngtier = 4;
1617
	} else if (HAS_PCH_SPLIT(i915)) {
1618 1619 1620
		gt->ier = intel_uncore_read(uncore, DEIER);
		gt->gtier[0] = intel_uncore_read(uncore, GTIER);
		gt->ngtier = 1;
1621
	} else if (IS_GEN(i915, 2)) {
1622
		gt->ier = intel_uncore_read16(uncore, GEN2_IER);
1623
	} else if (!IS_VALLEYVIEW(i915)) {
1624
		gt->ier = intel_uncore_read(uncore, GEN2_IER);
1625
	}
1626 1627 1628 1629
	gt->eir = intel_uncore_read(uncore, EIR);
	gt->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER);
}

1630 1631 1632 1633 1634
static void gt_record_info(struct intel_gt_coredump *gt)
{
	memcpy(&gt->info, &gt->_gt->info, sizeof(struct intel_gt_info));
}

1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653
/*
 * Generate a semi-unique error code. The code is not meant to carry any
 * meaning; its only purpose is to help avoid falsely duplicated bug
 * reports by grossly estimating the GPU error state.
 *
 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
 * the hang if we could strip the GTT offset information from it.
 *
 * It's only a small step better than a random number in its current form.
 *
 * Returns 0 when @ee is NULL.
 */
static u32 generate_ecode(const struct intel_engine_coredump *ee)
{
	if (!ee)
		return 0;

	/*
	 * IPEHR would be an ideal way to detect errors, as it is the gross
	 * measure of "the command that hung". However, some very common
	 * synchronization commands almost always show up there even when
	 * the hang is strictly a client bug, so mix in instdone to tell
	 * those cases apart somewhat.
	 */
	return ee->ipehr ^ ee->instdone.instdone;
}

1656
static const char *error_msg(struct i915_gpu_coredump *error)
1657
{
1658
	struct intel_engine_coredump *first = NULL;
1659
	unsigned int hung_classes = 0;
1660
	struct intel_gt_coredump *gt;
1661
	int len;
1662

1663 1664 1665
	for (gt = error->gt; gt; gt = gt->next) {
		struct intel_engine_coredump *cs;

1666 1667
		for (cs = gt->engine; cs; cs = cs->next) {
			if (cs->hung) {
1668
				hung_classes |= BIT(cs->engine->uabi_class);
1669 1670 1671 1672
				if (!first)
					first = cs;
			}
		}
1673 1674
	}

1675
	len = scnprintf(error->error_msg, sizeof(error->error_msg),
1676
			"GPU HANG: ecode %d:%x:%08x",
1677
			INTEL_GEN(error->i915), hung_classes,
1678
			generate_ecode(first));
1679
	if (first && first->context.pid) {
1680
		/* Just show the first executing process, more is confusing */
1681 1682 1683
		len += scnprintf(error->error_msg + len,
				 sizeof(error->error_msg) - len,
				 ", in %s [%d]",
1684
				 first->context.comm, first->context.pid);
1685
	}
1686

1687
	return error->error_msg;
1688 1689
}

1690
static void capture_gen(struct i915_gpu_coredump *error)
1691
{
1692 1693 1694 1695
	struct drm_i915_private *i915 = error->i915;

	error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
	error->suspended = i915->runtime_pm.suspended;
1696

1697 1698 1699 1700
	error->iommu = -1;
#ifdef CONFIG_INTEL_IOMMU
	error->iommu = intel_iommu_gfx_mapped;
#endif
1701 1702
	error->reset_count = i915_reset_count(&i915->gpu_error);
	error->suspend_count = i915->suspend_count;
1703

1704
	i915_params_copy(&error->params, &i915->params);
1705
	memcpy(&error->device_info,
1706
	       INTEL_INFO(i915),
1707
	       sizeof(error->device_info));
1708 1709 1710
	memcpy(&error->runtime_info,
	       RUNTIME_INFO(i915),
	       sizeof(error->runtime_info));
1711
	error->driver_caps = i915->caps;
1712 1713
}

1714 1715
/*
 * Allocate a coredump, initialise its refcount, timestamp it and record
 * the device-wide state via capture_gen().
 *
 * Returns NULL if the error_capture parameter is off or if the allocation
 * fails.
 */
struct i915_gpu_coredump *
i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)
{
	struct i915_gpu_coredump *dump = NULL;

	if (i915->params.error_capture)
		dump = kzalloc(sizeof(*dump), gfp);
	if (!dump)
		return NULL;

	kref_init(&dump->ref);
	dump->i915 = i915;

	dump->time = ktime_get_real();
	dump->boottime = ktime_get_boottime();
	dump->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time);
	dump->capture = jiffies;

	capture_gen(dump);

	return dump;
}

1739 1740 1741 1742
/* Convert @x days into seconds. */
#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))

/*
 * Allocate a zeroed gt coredump for @gt, note whether the gt is awake and
 * record its registers and fences. Returns NULL if the allocation fails.
 */
struct intel_gt_coredump *
intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp)
{
	struct intel_gt_coredump *gc = kzalloc(sizeof(*gc), gfp);

	if (gc) {
		gc->_gt = gt;
		gc->awake = intel_gt_pm_is_awake(gt);

		gt_record_regs(gc);
		gt_record_fences(gc);
	}

	return gc;
}
1758

1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770
/*
 * Allocate and initialise the compression state used while capturing vma.
 * Returns NULL if the allocation or compress_init() fails. @gt is not
 * used by this function.
 */
struct i915_vma_compress *
i915_vma_capture_prepare(struct intel_gt_coredump *gt)
{
	struct i915_vma_compress *compress;

	compress = kmalloc(sizeof(*compress), ALLOW_FAIL);
	if (compress && !compress_init(compress)) {
		kfree(compress);
		compress = NULL;
	}

	return compress;
}

1776 1777 1778 1779 1780
/*
 * Tear down and free the compression state from
 * i915_vma_capture_prepare(). A NULL @compress is a no-op; @gt is not
 * used by this function.
 */
void i915_vma_capture_finish(struct intel_gt_coredump *gt,
			     struct i915_vma_compress *compress)
{
	if (compress) {
		compress_fini(compress);
		kfree(compress);
	}
}

1786 1787
/*
 * Capture a full GPU coredump for @gt, flagging the engines in
 * @engine_mask as hung.
 *
 * Returns the new coredump, the ERR_PTR stored in gpu_error.first_error
 * if that slot holds one, or ERR_PTR(-ENOMEM) if the coredump or the
 * compression state cannot be allocated.
 */
struct i915_gpu_coredump *
i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask)
{
	struct drm_i915_private *i915 = gt->i915;
	struct i915_vma_compress *compress;
	struct i915_gpu_coredump *error;

	/* Check if GPU capture has been disabled */
	error = READ_ONCE(i915->gpu_error.first_error);
	if (IS_ERR(error))
		return error;

	error = i915_gpu_coredump_alloc(i915, ALLOW_FAIL);
	if (!error)
		return ERR_PTR(-ENOMEM);

	error->gt = intel_gt_coredump_alloc(gt, ALLOW_FAIL);
	if (!error->gt)
		goto overlay;

	compress = i915_vma_capture_prepare(error->gt);
	if (!compress) {
		/*
		 * NOTE(review): capture_gen() ran i915_params_copy() on
		 * error->params during allocation; confirm whether that
		 * copy owns memory which a bare kfree() here leaves behind.
		 */
		kfree(error->gt);
		kfree(error);
		return ERR_PTR(-ENOMEM);
	}

	gt_record_info(error->gt);
	gt_record_engines(error->gt, engine_mask, compress);

	if (INTEL_INFO(i915)->has_gt_uc)
		error->gt->uc = gt_record_uc(error->gt, compress);

	i915_vma_capture_finish(error->gt, compress);

	error->simulated |= error->gt->simulated;

overlay:
	error->overlay = intel_overlay_capture_error_state(i915);

	return error;
}

1828
void i915_error_state_store(struct i915_gpu_coredump *error)
1829
{
1830
	struct drm_i915_private *i915;
1831
	static bool warned;
1832

1833
	if (IS_ERR_OR_NULL(error))
1834 1835
		return;

1836
	i915 = error->i915;
1837
	drm_info(&i915->drm, "%s\n", error_msg(error));
1838

1839 1840
	if (error->simulated ||
	    cmpxchg(&i915->gpu_error.first_error, NULL, error))
1841 1842
		return;

1843
	i915_gpu_coredump_get(error);
1844

1845
	if (!xchg(&warned, true) &&
1846
	    ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
1847
		pr_info("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
1848 1849
		pr_info("Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/intel/issues/new.\n");
		pr_info("Please see https://gitlab.freedesktop.org/drm/intel/-/wikis/How-to-file-i915-bugs for details.\n");
1850 1851 1852 1853
		pr_info("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
		pr_info("The GPU crash dump is required to analyze GPU hangs, so please always attach it.\n");
		pr_info("GPU crash dump saved to /sys/class/drm/card%d/error\n",
			i915->drm.primary->index);
1854
	}
1855 1856
}

1857 1858
/**
 * i915_capture_error_state - capture an error record for later analysis
1859 1860 1861
 * @gt: intel_gt which originated the hang
 * @engine_mask: hung engines
 *
1862 1863 1864 1865 1866 1867
 *
 * Should be called when an error is detected (either a hang or an error
 * interrupt) to capture error state from the time of the error.  Fills
 * out a structure which becomes available in debugfs for user level tools
 * to pick up.
 */
1868 1869
void i915_capture_error_state(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask)
1870 1871 1872
{
	struct i915_gpu_coredump *error;

1873
	error = i915_gpu_coredump(gt, engine_mask);
1874
	if (IS_ERR(error)) {
1875
		cmpxchg(&gt->i915->gpu_error.first_error, NULL, error);
1876 1877 1878 1879 1880 1881 1882 1883
		return;
	}

	i915_error_state_store(error);
	i915_gpu_coredump_put(error);
}

struct i915_gpu_coredump *
1884
i915_first_error_state(struct drm_i915_private *i915)
1885
{
1886
	struct i915_gpu_coredump *error;
1887

1888 1889
	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
1890
	if (!IS_ERR_OR_NULL(error))
1891
		i915_gpu_coredump_get(error);
1892
	spin_unlock_irq(&i915->gpu_error.lock);
1893

1894
	return error;
1895 1896
}

1897
void i915_reset_error_state(struct drm_i915_private *i915)
1898
{
1899
	struct i915_gpu_coredump *error;
1900

1901 1902
	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
1903 1904
	if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
		i915->gpu_error.first_error = NULL;
1905
	spin_unlock_irq(&i915->gpu_error.lock);
1906

1907
	if (!IS_ERR_OR_NULL(error))
1908
		i915_gpu_coredump_put(error);
1909 1910 1911 1912 1913 1914 1915 1916
}

void i915_disable_error_state(struct drm_i915_private *i915, int err)
{
	spin_lock_irq(&i915->gpu_error.lock);
	if (!i915->gpu_error.first_error)
		i915->gpu_error.first_error = ERR_PTR(err);
	spin_unlock_irq(&i915->gpu_error.lock);
1917
}