/*
 * Copyright (c) 2008 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Keith Packard <keithp@keithp.com>
 *    Mika Kuoppala <mika.kuoppala@intel.com>
 *
 */

#include <linux/ascii85.h>
#include <linux/nmi.h>
#include <linux/scatterlist.h>
#include <linux/stop_machine.h>
#include <linux/utsname.h>
#include <linux/zlib.h>

#include <drm/drm_print.h>

#include "i915_gpu_error.h"
#include "i915_drv.h"

static inline const struct intel_engine_cs *
engine_lookup(const struct drm_i915_private *i915, unsigned int id)
{
	if (id >= I915_NUM_ENGINES)
		return NULL;

	return i915->engine[id];
}

static inline const char *
__engine_name(const struct intel_engine_cs *engine)
{
	return engine ? engine->name : "";
}

static const char *
engine_name(const struct drm_i915_private *i915, unsigned int id)
{
	return __engine_name(engine_lookup(i915, id));
}

static const char *tiling_flag(int tiling)
{
	switch (tiling) {
	default:
	case I915_TILING_NONE: return "";
	case I915_TILING_X: return " X";
	case I915_TILING_Y: return " Y";
	}
}

static const char *dirty_flag(int dirty)
{
	return dirty ? " dirty" : "";
}

static const char *purgeable_flag(int purgeable)
{
	return purgeable ? " purgeable" : "";
}

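/*
 * Point a scatterlist entry at a kernel buffer. As the error-state sgl is
 * only ever walked by the CPU, the dma_address field is free to be reused
 * as the logical byte offset of this chunk within the whole dump, which
 * later lets the read side seek without rewalking from the head.
 */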
static void __sg_set_buf(struct scatterlist *sg,
			 void *addr, unsigned int len, loff_t it)
{
	sg->page_link = (unsigned long)virt_to_page(addr);
	sg->offset = offset_in_page(addr);
	sg->length = len;
	sg->dma_address = it;
}

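/*
 * Make sure at least len+1 bytes are available in the output buffer,
 * flushing the current chunk into the scatterlist and chaining on a fresh
 * sg table page when the current one fills. Returns false (and latches
 * e->err) if memory cannot be found.
 */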
static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
{
	if (!len)
		return false;

	if (e->bytes + len + 1 <= e->size)
		return true;

	if (e->bytes) {
		__sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
		e->iter += e->bytes;
		e->buf = NULL;
		e->bytes = 0;
	}

	if (e->cur == e->end) {
		struct scatterlist *sgl;

		sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
		if (!sgl) {
			e->err = -ENOMEM;
			return false;
		}

		if (e->cur) {
			e->cur->offset = 0;
			e->cur->length = 0;
			e->cur->page_link =
				(unsigned long)sgl | SG_CHAIN;
		} else {
			e->sgl = sgl;
		}

		e->cur = sgl;
		e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
	}

	e->size = ALIGN(len + 1, SZ_64K);
	e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
	if (!e->buf) {
		e->size = PAGE_ALIGN(len + 1);
		e->buf = kmalloc(e->size, GFP_KERNEL);
	}
	if (!e->buf) {
		e->err = -ENOMEM;
		return false;
	}

	return true;
}

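/*
 * Format into the growable buffer: the string is measured first with a
 * NULL destination so that __i915_error_grow() can be asked for exactly
 * the right amount of space before printing for real.
 */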
__printf(2, 0)
static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
			       const char *fmt, va_list args)
{
	va_list ap;
	int len;

	if (e->err)
		return;

	va_copy(ap, args);
	len = vsnprintf(NULL, 0, fmt, ap);
	va_end(ap);
	if (len <= 0) {
		e->err = len;
		return;
	}

	if (!__i915_error_grow(e, len))
		return;

	GEM_BUG_ON(e->bytes >= e->size);
	len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
	if (len < 0) {
		e->err = len;
		return;
	}
	e->bytes += len;
}

static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
{
	unsigned len;

	if (e->err || !str)
		return;

	len = strlen(str);
	if (!__i915_error_grow(e, len))
		return;

	GEM_BUG_ON(e->bytes + len > e->size);
	memcpy(e->buf + e->bytes, str, len);
	e->bytes += len;
}

#define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
#define err_puts(e, s) i915_error_puts(e, s)

static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
{
	i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
}

static inline struct drm_printer
i915_error_printer(struct drm_i915_error_state_buf *e)
{
	struct drm_printer p = {
		.printfn = __i915_printfn_error,
		.arg = e,
	};
	return p;
}

#ifdef CONFIG_DRM_I915_COMPRESS_ERROR

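/*
 * With CONFIG_DRM_I915_COMPRESS_ERROR, captured pages are deflated with
 * zlib as they are copied. Everything is allocated GFP_ATOMIC since the
 * capture runs in an atomic context; c->tmp, if available, stages each
 * page out of write-combining memory so zlib reads from ordinary RAM.
 */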
struct compress {
	struct z_stream_s zstream;
	void *tmp;
};

static bool compress_init(struct compress *c)
{
	struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));

	zstream->workspace =
		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
			GFP_ATOMIC | __GFP_NOWARN);
	if (!zstream->workspace)
		return false;

	if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
		kfree(zstream->workspace);
		return false;
	}

	c->tmp = NULL;
	if (i915_has_memcpy_from_wc())
		c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);

	return true;
}

static void *compress_next_page(struct drm_i915_error_object *dst)
{
	unsigned long page;

	if (dst->page_count >= dst->num_pages)
		return ERR_PTR(-ENOSPC);

	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
	if (!page)
		return ERR_PTR(-ENOMEM);

	return dst->pages[dst->page_count++] = (void *)page;
}

static int compress_page(struct compress *c,
			 void *src,
			 struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	zstream->next_in = src;
	if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
		zstream->next_in = c->tmp;
	zstream->avail_in = PAGE_SIZE;

	do {
		if (zstream->avail_out == 0) {
			zstream->next_out = compress_next_page(dst);
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);

			zstream->avail_out = PAGE_SIZE;
		}

		if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
			return -EIO;

		touch_nmi_watchdog();
	} while (zstream->avail_in);

	/* Fallback to uncompressed if we increase size? */
	if (0 && zstream->total_out > zstream->total_in)
		return -E2BIG;

	return 0;
}

static int compress_flush(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	do {
		switch (zlib_deflate(zstream, Z_FINISH)) {
		case Z_OK: /* more space requested */
			zstream->next_out = compress_next_page(dst);
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);

			zstream->avail_out = PAGE_SIZE;
			break;

		case Z_STREAM_END:
			goto end;

		default: /* any error */
			return -EIO;
		}
	} while (1);

end:
	memset(zstream->next_out, 0, zstream->avail_out);
	dst->unused = zstream->avail_out;
	return 0;
}

static void compress_fini(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	zlib_deflateEnd(zstream);
	kfree(zstream->workspace);
	if (c->tmp)
		free_page((unsigned long)c->tmp);
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, ":");
}

#else

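/* Without zlib, each captured page is stored verbatim, one page per entry. */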
struct compress {
};

static bool compress_init(struct compress *c)
{
	return true;
}

static int compress_page(struct compress *c,
			 void *src,
			 struct drm_i915_error_object *dst)
{
	unsigned long page;
	void *ptr;

	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
	if (!page)
		return -ENOMEM;

	ptr = (void *)page;
	if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
		memcpy(ptr, src, PAGE_SIZE);
	dst->pages[dst->page_count++] = ptr;

	return 0;
}

static int compress_flush(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	return 0;
}

static void compress_fini(struct compress *c,
			  struct drm_i915_error_object *dst)
{
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, "~");
}

#endif

static void print_error_buffers(struct drm_i915_error_state_buf *m,
				const char *name,
				struct drm_i915_error_buffer *err,
				int count)
{
	err_printf(m, "%s [%d]:\n", name, count);

	while (count--) {
		err_printf(m, "    %08x_%08x %8u %02x %02x %02x",
			   upper_32_bits(err->gtt_offset),
			   lower_32_bits(err->gtt_offset),
			   err->size,
			   err->read_domains,
			   err->write_domain,
			   err->wseqno);
		err_puts(m, tiling_flag(err->tiling));
		err_puts(m, dirty_flag(err->dirty));
		err_puts(m, purgeable_flag(err->purgeable));
		err_puts(m, err->userptr ? " userptr" : "");
		err_puts(m, err->engine != -1 ? " " : "");
		err_puts(m, engine_name(m->i915, err->engine));
		err_puts(m, i915_cache_level_str(m->i915, err->cache_level));

		if (err->name)
			err_printf(m, " (name: %d)", err->name);
		if (err->fence_reg != I915_FENCE_REG_NONE)
			err_printf(m, " (fence: %d)", err->fence_reg);

		err_puts(m, "\n");
		err++;
	}
}

static void error_print_instdone(struct drm_i915_error_state_buf *m,
				 const struct drm_i915_error_engine *ee)
{
	int slice;
	int subslice;

	err_printf(m, "  INSTDONE: 0x%08x\n",
		   ee->instdone.instdone);

	if (ee->engine_id != RCS || INTEL_GEN(m->i915) <= 3)
		return;

	err_printf(m, "  SC_INSTDONE: 0x%08x\n",
		   ee->instdone.slice_common);

	if (INTEL_GEN(m->i915) <= 6)
		return;

	for_each_instdone_slice_subslice(m->i915, slice, subslice)
		err_printf(m, "  SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
			   slice, subslice,
			   ee->instdone.sampler[slice][subslice]);

	for_each_instdone_slice_subslice(m->i915, slice, subslice)
		err_printf(m, "  ROW_INSTDONE[%d][%d]: 0x%08x\n",
			   slice, subslice,
			   ee->instdone.row[slice][subslice]);
}

static const char *bannable(const struct drm_i915_error_context *ctx)
{
	return ctx->bannable ? "" : " (unbannable)";
}

static void error_print_request(struct drm_i915_error_state_buf *m,
				const char *prefix,
				const struct drm_i915_error_request *erq,
				const unsigned long epoch)
{
	if (!erq->seqno)
		return;

	err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
		   prefix, erq->pid, erq->ban_score,
		   erq->context, erq->seqno, erq->sched_attr.priority,
		   jiffies_to_msecs(erq->jiffies - epoch),
		   erq->start, erq->head, erq->tail);
}

static void error_print_context(struct drm_i915_error_state_buf *m,
				const char *header,
				const struct drm_i915_error_context *ctx)
{
	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n",
		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
		   ctx->sched_attr.priority, ctx->ban_score, bannable(ctx),
		   ctx->guilty, ctx->active);
}

static void error_print_engine(struct drm_i915_error_state_buf *m,
			       const struct drm_i915_error_engine *ee,
			       const unsigned long epoch)
{
	int n;

	err_printf(m, "%s command stream:\n",
		   engine_name(m->i915, ee->engine_id));
	err_printf(m, "  IDLE?: %s\n", yesno(ee->idle));
	err_printf(m, "  START: 0x%08x\n", ee->start);
	err_printf(m, "  HEAD:  0x%08x [0x%08x]\n", ee->head, ee->rq_head);
	err_printf(m, "  TAIL:  0x%08x [0x%08x, 0x%08x]\n",
		   ee->tail, ee->rq_post, ee->rq_tail);
	err_printf(m, "  CTL:   0x%08x\n", ee->ctl);
	err_printf(m, "  MODE:  0x%08x\n", ee->mode);
	err_printf(m, "  HWS:   0x%08x\n", ee->hws);
	err_printf(m, "  ACTHD: 0x%08x %08x\n",
		   (u32)(ee->acthd>>32), (u32)ee->acthd);
	err_printf(m, "  IPEIR: 0x%08x\n", ee->ipeir);
	err_printf(m, "  IPEHR: 0x%08x\n", ee->ipehr);
487 488 489

	error_print_instdone(m, ee);

	if (ee->batchbuffer) {
		u64 start = ee->batchbuffer->gtt_offset;
		u64 end = start + ee->batchbuffer->gtt_size;

		err_printf(m, "  batch: [0x%08x_%08x, 0x%08x_%08x]\n",
			   upper_32_bits(start), lower_32_bits(start),
			   upper_32_bits(end), lower_32_bits(end));
	}
	if (INTEL_GEN(m->i915) >= 4) {
		err_printf(m, "  BBADDR: 0x%08x_%08x\n",
			   (u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
		err_printf(m, "  BB_STATE: 0x%08x\n", ee->bbstate);
		err_printf(m, "  INSTPS: 0x%08x\n", ee->instps);
	}
	err_printf(m, "  INSTPM: 0x%08x\n", ee->instpm);
	err_printf(m, "  FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
		   lower_32_bits(ee->faddr));
	if (INTEL_GEN(m->i915) >= 6) {
		err_printf(m, "  RC PSMI: 0x%08x\n", ee->rc_psmi);
		err_printf(m, "  FAULT_REG: 0x%08x\n", ee->fault_reg);
		err_printf(m, "  SYNC_0: 0x%08x\n",
			   ee->semaphore_mboxes[0]);
		err_printf(m, "  SYNC_1: 0x%08x\n",
			   ee->semaphore_mboxes[1]);
		if (HAS_VEBOX(m->i915))
			err_printf(m, "  SYNC_2: 0x%08x\n",
				   ee->semaphore_mboxes[2]);
	}
	if (HAS_PPGTT(m->i915)) {
		err_printf(m, "  GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);

		if (INTEL_GEN(m->i915) >= 8) {
			int i;
			for (i = 0; i < 4; i++)
				err_printf(m, "  PDP%d: 0x%016llx\n",
					   i, ee->vm_info.pdp[i]);
		} else {
			err_printf(m, "  PP_DIR_BASE: 0x%08x\n",
				   ee->vm_info.pp_dir_base);
		}
	}
	err_printf(m, "  seqno: 0x%08x\n", ee->seqno);
	err_printf(m, "  last_seqno: 0x%08x\n", ee->last_seqno);
	err_printf(m, "  waiting: %s\n", yesno(ee->waiting));
	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
	err_printf(m, "  hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
	err_printf(m, "  hangcheck action: %s\n",
		   hangcheck_action_to_str(ee->hangcheck_action));
	err_printf(m, "  hangcheck action timestamp: %dms (%lu%s)\n",
		   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
		   ee->hangcheck_timestamp,
		   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
	err_printf(m, "  engine reset count: %u\n", ee->reset_count);

	for (n = 0; n < ee->num_ports; n++) {
		err_printf(m, "  ELSP[%d]:", n);
		error_print_request(m, " ", &ee->execlist[n], epoch);
	}

	error_print_context(m, "  Active context: ", &ee->context);
}

void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
	va_list args;

	va_start(args, f);
	i915_error_vprintf(e, f, args);
	va_end(args);
}

static void print_error_obj(struct drm_i915_error_state_buf *m,
			    struct intel_engine_cs *engine,
			    const char *name,
			    struct drm_i915_error_object *obj)
{
	char out[ASCII85_BUFSZ];
	int page;

	if (!obj)
		return;

	if (name) {
		err_printf(m, "%s --- %s = 0x%08x %08x\n",
			   engine ? engine->name : "global", name,
			   upper_32_bits(obj->gtt_offset),
			   lower_32_bits(obj->gtt_offset));
	}

	err_compression_marker(m);
	for (page = 0; page < obj->page_count; page++) {
		int i, len;

		len = PAGE_SIZE;
		if (page == obj->page_count - 1)
			len -= obj->unused;
		len = ascii85_encode_len(len);

		for (i = 0; i < len; i++)
			err_puts(m, ascii85_encode(obj->pages[page][i], out));
	}
	err_puts(m, "\n");
}

static void err_print_capabilities(struct drm_i915_error_state_buf *m,
				   const struct intel_device_info *info,
				   const struct intel_runtime_info *runtime,
				   const struct intel_driver_caps *caps)
{
	struct drm_printer p = i915_error_printer(m);

	intel_device_info_dump_flags(info, &p);
	intel_driver_caps_print(caps, &p);
	intel_device_info_dump_topology(&runtime->sseu, &p);
}

static void err_print_params(struct drm_i915_error_state_buf *m,
			     const struct i915_params *params)
{
	struct drm_printer p = i915_error_printer(m);

	i915_params_dump(params, &p);
}

static void err_print_pciid(struct drm_i915_error_state_buf *m,
			    struct drm_i915_private *i915)
{
	struct pci_dev *pdev = i915->drm.pdev;

	err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
	err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
	err_printf(m, "PCI Subsystem: %04x:%04x\n",
		   pdev->subsystem_vendor,
		   pdev->subsystem_device);
}

static void err_print_uc(struct drm_i915_error_state_buf *m,
			 const struct i915_error_uc *error_uc)
{
	struct drm_printer p = i915_error_printer(m);
	const struct i915_gpu_state *error =
		container_of(error_uc, typeof(*error), uc);

	if (!error->device_info.has_guc)
		return;

	intel_uc_fw_dump(&error_uc->guc_fw, &p);
	intel_uc_fw_dump(&error_uc->huc_fw, &p);
	print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log);
}

static void err_free_sgl(struct scatterlist *sgl)
{
	while (sgl) {
		struct scatterlist *sg;

		for (sg = sgl; !sg_is_chain(sg); sg++) {
			kfree(sg_virt(sg));
			if (sg_is_last(sg))
				break;
		}

		sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg);
		free_page((unsigned long)sgl);
		sgl = sg;
	}
}

static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
			       struct i915_gpu_state *error)
{
	struct drm_i915_error_object *obj;
	struct timespec64 ts;
	int i, j;

	if (*error->error_msg)
		err_printf(m, "%s\n", error->error_msg);
	err_printf(m, "Kernel: %s %s\n",
		   init_utsname()->release,
		   init_utsname()->machine);
	ts = ktime_to_timespec64(error->time);
	err_printf(m, "Time: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	ts = ktime_to_timespec64(error->boottime);
	err_printf(m, "Boottime: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	ts = ktime_to_timespec64(error->uptime);
	err_printf(m, "Uptime: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ);
	err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n",
		   error->capture,
		   jiffies_to_msecs(jiffies - error->capture),
		   jiffies_to_msecs(error->capture - error->epoch));

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		if (error->engine[i].hangcheck_stalled &&
		    error->engine[i].context.pid) {
			err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
				   engine_name(m->i915, i),
				   error->engine[i].context.comm,
				   error->engine[i].context.pid,
				   error->engine[i].context.ban_score,
				   bannable(&error->engine[i].context));
		}
	}
	err_printf(m, "Reset count: %u\n", error->reset_count);
	err_printf(m, "Suspend count: %u\n", error->suspend_count);
	err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
	err_print_pciid(m, m->i915);

	err_printf(m, "IOMMU enabled?: %d\n", error->iommu);

	if (HAS_CSR(m->i915)) {
		struct intel_csr *csr = &m->i915->csr;

		err_printf(m, "DMC loaded: %s\n",
			   yesno(csr->dmc_payload != NULL));
		err_printf(m, "DMC fw version: %d.%d\n",
			   CSR_VERSION_MAJOR(csr->version),
			   CSR_VERSION_MINOR(csr->version));
	}

	err_printf(m, "GT awake: %s\n", yesno(error->awake));
	err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
	err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
	err_printf(m, "EIR: 0x%08x\n", error->eir);
	err_printf(m, "IER: 0x%08x\n", error->ier);
	for (i = 0; i < error->ngtier; i++)
		err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]);
	err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
	err_printf(m, "CCID: 0x%08x\n", error->ccid);
	err_printf(m, "Missed interrupts: 0x%08lx\n",
		   m->i915->gpu_error.missed_irq_rings);

	for (i = 0; i < error->nfence; i++)
		err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);

	if (INTEL_GEN(m->i915) >= 6) {
		err_printf(m, "ERROR: 0x%08x\n", error->error);

		if (INTEL_GEN(m->i915) >= 8)
			err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
				   error->fault_data1, error->fault_data0);

		err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
	}

	if (IS_GEN(m->i915, 7))
		err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		if (error->engine[i].engine_id != -1)
			error_print_engine(m, &error->engine[i], error->epoch);
	}

	for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
		char buf[128];
		int len, first = 1;

		if (!error->active_vm[i])
			break;

		len = scnprintf(buf, sizeof(buf), "Active (");
		for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
			if (error->engine[j].vm != error->active_vm[i])
				continue;

			len += scnprintf(buf + len, sizeof(buf), "%s%s",
					 first ? "" : ", ",
					 m->i915->engine[j]->name);
			first = 0;
		}
		scnprintf(buf + len, sizeof(buf), ")");
		print_error_buffers(m, buf,
				    error->active_bo[i],
				    error->active_bo_count[i]);
	}

	print_error_buffers(m, "Pinned (global)",
			    error->pinned_bo,
			    error->pinned_bo_count);

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		const struct drm_i915_error_engine *ee = &error->engine[i];

		obj = ee->batchbuffer;
		if (obj) {
			err_puts(m, m->i915->engine[i]->name);
			if (ee->context.pid)
				err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)",
					   ee->context.comm,
					   ee->context.pid,
					   ee->context.handle,
					   ee->context.hw_id,
					   ee->context.ban_score,
					   bannable(&ee->context));
			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
				   upper_32_bits(obj->gtt_offset),
				   lower_32_bits(obj->gtt_offset));
			print_error_obj(m, m->i915->engine[i], NULL, obj);
		}

		for (j = 0; j < ee->user_bo_count; j++)
			print_error_obj(m, m->i915->engine[i],
					"user", ee->user_bo[j]);

		if (ee->num_requests) {
			err_printf(m, "%s --- %d requests\n",
				   m->i915->engine[i]->name,
				   ee->num_requests);
			for (j = 0; j < ee->num_requests; j++)
				error_print_request(m, " ",
						    &ee->requests[j],
						    error->epoch);
		}

		if (IS_ERR(ee->waiters)) {
			err_printf(m, "%s --- ? waiters [unable to acquire spinlock]\n",
				   m->i915->engine[i]->name);
		} else if (ee->num_waiters) {
			err_printf(m, "%s --- %d waiters\n",
				   m->i915->engine[i]->name,
				   ee->num_waiters);
			for (j = 0; j < ee->num_waiters; j++) {
				err_printf(m, " seqno 0x%08x for %s [%d]\n",
					   ee->waiters[j].seqno,
					   ee->waiters[j].comm,
					   ee->waiters[j].pid);
			}
		}

		print_error_obj(m, m->i915->engine[i],
				"ringbuffer", ee->ringbuffer);

		print_error_obj(m, m->i915->engine[i],
				"HW Status", ee->hws_page);

		print_error_obj(m, m->i915->engine[i],
				"HW context", ee->ctx);

		print_error_obj(m, m->i915->engine[i],
				"WA context", ee->wa_ctx);

		print_error_obj(m, m->i915->engine[i],
				"WA batchbuffer", ee->wa_batchbuffer);

		print_error_obj(m, m->i915->engine[i],
				"NULL context", ee->default_state);
	}

	if (error->overlay)
		intel_overlay_print_error_state(m, error->overlay);

	if (error->display)
		intel_display_print_error_state(m, error->display);

	err_print_capabilities(m, &error->device_info, &error->runtime_info,
			       &error->driver_caps);
	err_print_params(m, &error->params);
	err_print_uc(m, &error->uc);
}

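/*
 * Render the error state into the scatterlist exactly once; the result is
 * published with cmpxchg() so that concurrent callers race safely and the
 * loser simply frees its duplicate rendering.
 */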
static int err_print_to_sgl(struct i915_gpu_state *error)
{
	struct drm_i915_error_state_buf m;

	if (IS_ERR(error))
		return PTR_ERR(error);

	if (READ_ONCE(error->sgl))
		return 0;

	memset(&m, 0, sizeof(m));
	m.i915 = error->i915;

	__err_print_to_sgl(&m, error);

	if (m.buf) {
		__sg_set_buf(m.cur++, m.buf, m.bytes, m.iter);
		m.bytes = 0;
		m.buf = NULL;
	}
	if (m.cur) {
		GEM_BUG_ON(m.end < m.cur);
		sg_mark_end(m.cur - 1);
	}
	GEM_BUG_ON(m.sgl && !m.cur);

	if (m.err) {
		err_free_sgl(m.sgl);
		return m.err;
	}

	if (cmpxchg(&error->sgl, NULL, m.sgl))
		err_free_sgl(m.sgl);

	return 0;
}

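/*
 * Copy a byte range of the rendered dump into buf. error->fit remembers
 * the chunk where the previous read stopped so that sequential reads
 * resume where they left off instead of rescanning the chain each time.
 */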
ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
				      char *buf, loff_t off, size_t rem)
{
	struct scatterlist *sg;
	size_t count;
	loff_t pos;
	int err;

	if (!error || !rem)
		return 0;

	err = err_print_to_sgl(error);
	if (err)
		return err;

	sg = READ_ONCE(error->fit);
	if (!sg || off < sg->dma_address)
		sg = error->sgl;
	if (!sg)
		return 0;

	pos = sg->dma_address;
	count = 0;
	do {
		size_t len, start;

		if (sg_is_chain(sg)) {
			sg = sg_chain_ptr(sg);
			GEM_BUG_ON(sg_is_chain(sg));
		}

		len = sg->length;
		if (pos + len <= off) {
			pos += len;
			continue;
		}

		start = sg->offset;
		if (pos < off) {
			GEM_BUG_ON(off - pos > len);
			len -= off - pos;
			start += off - pos;
			pos = off;
		}

		len = min(len, rem);
		GEM_BUG_ON(!len || len > sg->length);

		memcpy(buf, page_address(sg_page(sg)) + start, len);

		count += len;
		pos += len;

		buf += len;
		rem -= len;
		if (!rem) {
			WRITE_ONCE(error->fit, sg);
			break;
		}
	} while (!sg_is_last(sg++));

	return count;
}

static void i915_error_object_free(struct drm_i915_error_object *obj)
{
	int page;

	if (obj == NULL)
		return;

	for (page = 0; page < obj->page_count; page++)
		free_page((unsigned long)obj->pages[page]);

	kfree(obj);
}

static void cleanup_params(struct i915_gpu_state *error)
{
	i915_params_free(&error->params);
}

static void cleanup_uc_state(struct i915_gpu_state *error)
{
	struct i915_error_uc *error_uc = &error->uc;

	kfree(error_uc->guc_fw.path);
	kfree(error_uc->huc_fw.path);
	i915_error_object_free(error_uc->guc_log);
}

void __i915_gpu_state_free(struct kref *error_ref)
{
	struct i915_gpu_state *error =
		container_of(error_ref, typeof(*error), ref);
	long i, j;

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		struct drm_i915_error_engine *ee = &error->engine[i];

		for (j = 0; j < ee->user_bo_count; j++)
			i915_error_object_free(ee->user_bo[j]);
		kfree(ee->user_bo);

		i915_error_object_free(ee->batchbuffer);
		i915_error_object_free(ee->wa_batchbuffer);
		i915_error_object_free(ee->ringbuffer);
		i915_error_object_free(ee->hws_page);
		i915_error_object_free(ee->ctx);
		i915_error_object_free(ee->wa_ctx);

		kfree(ee->requests);
		if (!IS_ERR_OR_NULL(ee->waiters))
			kfree(ee->waiters);
	}

	for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
		kfree(error->active_bo[i]);
	kfree(error->pinned_bo);

	kfree(error->overlay);
	kfree(error->display);

	cleanup_params(error);
	cleanup_uc_state(error);

	err_free_sgl(error->sgl);
	kfree(error);
}

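/*
 * Snapshot the contents of a vma: each backing page is bound in turn into
 * the GGTT slot reserved at ggtt->error_capture, read back through an
 * atomic WC mapping, and then compressed into dst->pages[].
 */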
static struct drm_i915_error_object *
i915_error_object_create(struct drm_i915_private *i915,
			 struct i915_vma *vma)
{
	struct i915_ggtt *ggtt = &i915->ggtt;
	const u64 slot = ggtt->error_capture.start;
	struct drm_i915_error_object *dst;
	struct compress compress;
	unsigned long num_pages;
	struct sgt_iter iter;
	dma_addr_t dma;
	int ret;

	if (!vma || !vma->pages)
		return NULL;

	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
	num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
		      GFP_ATOMIC | __GFP_NOWARN);
	if (!dst)
		return NULL;

	dst->gtt_offset = vma->node.start;
	dst->gtt_size = vma->node.size;
	dst->num_pages = num_pages;
	dst->page_count = 0;
	dst->unused = 0;

	if (!compress_init(&compress)) {
		kfree(dst);
		return NULL;
	}

	ret = -EINVAL;
	for_each_sgt_dma(dma, iter, vma->pages) {
		void __iomem *s;

		ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);

		s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
		ret = compress_page(&compress, (void  __force *)s, dst);
		io_mapping_unmap_atomic(s);
		if (ret)
			break;
	}

	if (ret || compress_flush(&compress, dst)) {
		while (dst->page_count--)
			free_page((unsigned long)dst->pages[dst->page_count]);
		kfree(dst);
		dst = NULL;
	}

	compress_fini(&compress, dst);
	return dst;
}

/* The error capture is special as it tries to run underneath the normal
 * locking rules - so we use the raw version of the i915_gem_active lookup.
 */
static inline u32
__active_get_seqno(struct i915_gem_active *active)
{
	struct i915_request *request;

	request = __i915_gem_active_peek(active);
	return request ? request->global_seqno : 0;
}

static inline int
__active_get_engine_id(struct i915_gem_active *active)
{
	struct i915_request *request;

	request = __i915_gem_active_peek(active);
	return request ? request->engine->id : -1;
}

static void capture_bo(struct drm_i915_error_buffer *err,
		       struct i915_vma *vma)
{
	struct drm_i915_gem_object *obj = vma->obj;

	err->size = obj->base.size;
	err->name = obj->base.name;

	err->wseqno = __active_get_seqno(&obj->frontbuffer_write);
	err->engine = __active_get_engine_id(&obj->frontbuffer_write);

	err->gtt_offset = vma->node.start;
	err->read_domains = obj->read_domains;
	err->write_domain = obj->write_domain;
	err->fence_reg = vma->fence ? vma->fence->id : -1;
	err->tiling = i915_gem_object_get_tiling(obj);
	err->dirty = obj->mm.dirty;
	err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
	err->userptr = obj->userptr.mm != NULL;
	err->cache_level = obj->cache_level;
}

static u32 capture_error_bo(struct drm_i915_error_buffer *err,
			    int count, struct list_head *head,
			    bool pinned_only)
{
	struct i915_vma *vma;
	int i = 0;

	list_for_each_entry(vma, head, vm_link) {
		if (!vma->obj)
			continue;

		if (pinned_only && !i915_vma_is_pinned(vma))
			continue;

		capture_bo(err++, vma);
		if (++i == count)
			break;
	}

	return i;
}

/* Generate a semi-unique error code. The code is not meant to have meaning;
 * its only purpose is to try to prevent false duplicated bug reports by
 * grossly estimating a GPU error state.
 *
 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
 * the hang if we could strip the GTT offset information from it.
 *
 * It's only a small step better than a random number in its current form.
 */
static u32 i915_error_generate_code(struct drm_i915_private *dev_priv,
				    struct i915_gpu_state *error,
				    int *engine_id)
{
	u32 error_code = 0;
	int i;

	/* IPEHR would be an ideal way to detect errors, as it's the gross
	 * measure of "the command that hung." However, it contains some very
	 * common synchronization commands which almost always appear in cases
	 * that are strictly a client bug. Use instdone to help differentiate
	 * those.
	 */
	for (i = 0; i < I915_NUM_ENGINES; i++) {
		if (error->engine[i].hangcheck_stalled) {
			if (engine_id)
				*engine_id = i;

			return error->engine[i].ipehr ^
			       error->engine[i].instdone.instdone;
		}
	}

	return error_code;
}

static void gem_record_fences(struct i915_gpu_state *error)
{
	struct drm_i915_private *dev_priv = error->i915;
	int i;

	if (INTEL_GEN(dev_priv) >= 6) {
		for (i = 0; i < dev_priv->num_fence_regs; i++)
			error->fence[i] = I915_READ64(FENCE_REG_GEN6_LO(i));
	} else if (INTEL_GEN(dev_priv) >= 4) {
		for (i = 0; i < dev_priv->num_fence_regs; i++)
			error->fence[i] = I915_READ64(FENCE_REG_965_LO(i));
	} else {
		for (i = 0; i < dev_priv->num_fence_regs; i++)
			error->fence[i] = I915_READ(FENCE_REG(i));
	}
	error->nfence = i;
}

static void gen6_record_semaphore_state(struct intel_engine_cs *engine,
					struct drm_i915_error_engine *ee)
{
	struct drm_i915_private *dev_priv = engine->i915;

	ee->semaphore_mboxes[0] = I915_READ(RING_SYNC_0(engine->mmio_base));
	ee->semaphore_mboxes[1] = I915_READ(RING_SYNC_1(engine->mmio_base));
	if (HAS_VEBOX(dev_priv))
		ee->semaphore_mboxes[2] =
			I915_READ(RING_SYNC_2(engine->mmio_base));
}

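/*
 * The breadcrumb lock is only ever taken with spin_trylock_irq() here, as
 * this runs from the hung-GPU path where the current holder may itself be
 * stuck; on contention ee->waiters is set to ERR_PTR(-EDEADLK) instead of
 * risking a deadlock.
 */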
static void error_record_engine_waiters(struct intel_engine_cs *engine,
					struct drm_i915_error_engine *ee)
{
	struct intel_breadcrumbs *b = &engine->breadcrumbs;
	struct drm_i915_error_waiter *waiter;
	struct rb_node *rb;
	int count;

	ee->num_waiters = 0;
	ee->waiters = NULL;

	if (RB_EMPTY_ROOT(&b->waiters))
		return;

	if (!spin_trylock_irq(&b->rb_lock)) {
		ee->waiters = ERR_PTR(-EDEADLK);
		return;
	}

	count = 0;
	for (rb = rb_first(&b->waiters); rb != NULL; rb = rb_next(rb))
		count++;
	spin_unlock_irq(&b->rb_lock);

	waiter = NULL;
	if (count)
		waiter = kmalloc_array(count,
				       sizeof(struct drm_i915_error_waiter),
				       GFP_ATOMIC);
	if (!waiter)
		return;

	if (!spin_trylock_irq(&b->rb_lock)) {
		kfree(waiter);
		ee->waiters = ERR_PTR(-EDEADLK);
		return;
	}

	ee->waiters = waiter;
	for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
		struct intel_wait *w = rb_entry(rb, typeof(*w), node);

		strcpy(waiter->comm, w->tsk->comm);
		waiter->pid = w->tsk->pid;
		waiter->seqno = w->seqno;
		waiter++;

		if (++ee->num_waiters == count)
			break;
	}
	spin_unlock_irq(&b->rb_lock);
}

static void error_record_engine_registers(struct i915_gpu_state *error,
					  struct intel_engine_cs *engine,
					  struct drm_i915_error_engine *ee)
{
	struct drm_i915_private *dev_priv = engine->i915;

	if (INTEL_GEN(dev_priv) >= 6) {
		ee->rc_psmi = I915_READ(RING_PSMI_CTL(engine->mmio_base));
		if (INTEL_GEN(dev_priv) >= 8) {
			ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG);
		} else {
			gen6_record_semaphore_state(engine, ee);
			ee->fault_reg = I915_READ(RING_FAULT_REG(engine));
		}
	}

	if (INTEL_GEN(dev_priv) >= 4) {
		ee->faddr = I915_READ(RING_DMA_FADD(engine->mmio_base));
		ee->ipeir = I915_READ(RING_IPEIR(engine->mmio_base));
		ee->ipehr = I915_READ(RING_IPEHR(engine->mmio_base));
		ee->instps = I915_READ(RING_INSTPS(engine->mmio_base));
		ee->bbaddr = I915_READ(RING_BBADDR(engine->mmio_base));
		if (INTEL_GEN(dev_priv) >= 8) {
			ee->faddr |= (u64) I915_READ(RING_DMA_FADD_UDW(engine->mmio_base)) << 32;
			ee->bbaddr |= (u64) I915_READ(RING_BBADDR_UDW(engine->mmio_base)) << 32;
		}
		ee->bbstate = I915_READ(RING_BBSTATE(engine->mmio_base));
	} else {
		ee->faddr = I915_READ(DMA_FADD_I8XX);
		ee->ipeir = I915_READ(IPEIR);
		ee->ipehr = I915_READ(IPEHR);
	}

	intel_engine_get_instdone(engine, &ee->instdone);

	ee->waiting = intel_engine_has_waiter(engine);
	ee->instpm = I915_READ(RING_INSTPM(engine->mmio_base));
	ee->acthd = intel_engine_get_active_head(engine);
	ee->seqno = intel_engine_get_seqno(engine);
	ee->last_seqno = intel_engine_last_submit(engine);
	ee->start = I915_READ_START(engine);
	ee->head = I915_READ_HEAD(engine);
	ee->tail = I915_READ_TAIL(engine);
	ee->ctl = I915_READ_CTL(engine);
	if (INTEL_GEN(dev_priv) > 2)
		ee->mode = I915_READ_MODE(engine);

	if (!HWS_NEEDS_PHYSICAL(dev_priv)) {
		i915_reg_t mmio;

		if (IS_GEN(dev_priv, 7)) {
			switch (engine->id) {
			default:
			case RCS:
				mmio = RENDER_HWS_PGA_GEN7;
				break;
			case BCS:
				mmio = BLT_HWS_PGA_GEN7;
				break;
			case VCS:
				mmio = BSD_HWS_PGA_GEN7;
				break;
			case VECS:
				mmio = VEBOX_HWS_PGA_GEN7;
				break;
			}
		} else if (IS_GEN(engine->i915, 6)) {
			mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
		} else {
			/* XXX: gen8 returns to sanity */
			mmio = RING_HWS_PGA(engine->mmio_base);
		}

		ee->hws = I915_READ(mmio);
	}

	ee->idle = intel_engine_is_idle(engine);
	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
	ee->hangcheck_action = engine->hangcheck.action;
	ee->hangcheck_stalled = engine->hangcheck.stalled;
	ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
						  engine);

	if (HAS_PPGTT(dev_priv)) {
		int i;

		ee->vm_info.gfx_mode = I915_READ(RING_MODE_GEN7(engine));

		if (IS_GEN(dev_priv, 6))
			ee->vm_info.pp_dir_base =
				I915_READ(RING_PP_DIR_BASE_READ(engine));
		else if (IS_GEN(dev_priv, 7))
			ee->vm_info.pp_dir_base =
				I915_READ(RING_PP_DIR_BASE(engine));
		else if (INTEL_GEN(dev_priv) >= 8)
			for (i = 0; i < 4; i++) {
				ee->vm_info.pdp[i] =
					I915_READ(GEN8_RING_PDP_UDW(engine, i));
				ee->vm_info.pdp[i] <<= 32;
				ee->vm_info.pdp[i] |=
					I915_READ(GEN8_RING_PDP_LDW(engine, i));
			}
	}
}

static void record_request(struct i915_request *request,
			   struct drm_i915_error_request *erq)
{
	struct i915_gem_context *ctx = request->gem_context;

	erq->context = ctx->hw_id;
	erq->sched_attr = request->sched.attr;
	erq->ban_score = atomic_read(&ctx->ban_score);
	erq->seqno = request->global_seqno;
	erq->jiffies = request->emitted_jiffies;
	erq->start = i915_ggtt_offset(request->ring->vma);
	erq->head = request->head;
	erq->tail = request->tail;

	rcu_read_lock();
	erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0;
	rcu_read_unlock();
}

static void engine_record_requests(struct intel_engine_cs *engine,
				   struct i915_request *first,
				   struct drm_i915_error_engine *ee)
{
	struct i915_request *request;
	int count;

	count = 0;
	request = first;
	list_for_each_entry_from(request, &engine->timeline.requests, link)
		count++;
	if (!count)
		return;

	ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
	if (!ee->requests)
		return;

	ee->num_requests = count;

	count = 0;
	request = first;
	list_for_each_entry_from(request, &engine->timeline.requests, link) {
		if (count >= ee->num_requests) {
			/*
			 * If the ring request list was changed in
			 * between the point where the error request
			 * list was created and dimensioned and this
			 * point then just exit early to avoid crashes.
			 *
			 * We don't need to communicate that the
			 * request list changed state during error
			 * state capture and that the error state is
			 * slightly incorrect as a consequence since we
			 * are typically only interested in the request
			 * list state at the point of error state
			 * capture, not in any changes happening during
			 * the capture.
			 */
			break;
		}

		record_request(request, &ee->requests[count++]);
	}
	ee->num_requests = count;
}

static void error_record_engine_execlists(struct intel_engine_cs *engine,
					  struct drm_i915_error_engine *ee)
{
	const struct intel_engine_execlists * const execlists = &engine->execlists;
	unsigned int n;

	for (n = 0; n < execlists_num_ports(execlists); n++) {
		struct i915_request *rq = port_request(&execlists->port[n]);

		if (!rq)
			break;

		record_request(rq, &ee->execlist[n]);
	}

	ee->num_ports = n;
}

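/*
 * Record the submitting task's name/pid together with the context's
 * scheduling and banning state; pid_task() is looked up under RCU as the
 * task may be exiting concurrently with the hang.
 */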
static void record_context(struct drm_i915_error_context *e,
			   struct i915_gem_context *ctx)
{
	if (ctx->pid) {
		struct task_struct *task;

		rcu_read_lock();
		task = pid_task(ctx->pid, PIDTYPE_PID);
		if (task) {
			strcpy(e->comm, task->comm);
			e->pid = task->pid;
		}
		rcu_read_unlock();
	}

	e->handle = ctx->user_handle;
	e->hw_id = ctx->hw_id;
	e->sched_attr = ctx->sched;
	e->ban_score = atomic_read(&ctx->ban_score);
	e->bannable = i915_gem_context_is_bannable(ctx);
	e->guilty = atomic_read(&ctx->guilty_count);
	e->active = atomic_read(&ctx->active_count);
}

static void request_record_user_bo(struct i915_request *request,
				   struct drm_i915_error_engine *ee)
{
	struct i915_capture_list *c;
	struct drm_i915_error_object **bo;
	long count, max;

	max = 0;
	for (c = request->capture_list; c; c = c->next)
		max++;
	if (!max)
		return;

	bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
	if (!bo) {
		/* If we can't capture everything, try to capture something. */
		max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
		bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
	}
	if (!bo)
		return;

	count = 0;
	for (c = request->capture_list; c; c = c->next) {
		bo[count] = i915_error_object_create(request->i915, c->vma);
		if (!bo[count])
			break;
		if (++count == max)
			break;
	}

	ee->user_bo = bo;
	ee->user_bo_count = count;
}

static struct drm_i915_error_object *
capture_object(struct drm_i915_private *dev_priv,
	       struct drm_i915_gem_object *obj)
{
	if (obj && i915_gem_object_has_pages(obj)) {
		struct i915_vma fake = {
			.node = { .start = U64_MAX, .size = obj->base.size },
			.size = obj->base.size,
			.pages = obj->mm.pages,
			.obj = obj,
		};

		return i915_error_object_create(dev_priv, &fake);
	} else {
		return NULL;
	}
}

static void gem_record_rings(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;
	struct i915_ggtt *ggtt = &i915->ggtt;
	int i;

	for (i = 0; i < I915_NUM_ENGINES; i++) {
		struct intel_engine_cs *engine = i915->engine[i];
		struct drm_i915_error_engine *ee = &error->engine[i];
		struct i915_request *request;

		ee->engine_id = -1;

		if (!engine)
			continue;

		ee->engine_id = i;

		error_record_engine_registers(error, engine, ee);
		error_record_engine_waiters(engine, ee);
		error_record_engine_execlists(engine, ee);

		request = i915_gem_find_active_request(engine);
		if (request) {
			struct i915_gem_context *ctx = request->gem_context;
			struct intel_ring *ring;

			ee->vm = ctx->ppgtt ? &ctx->ppgtt->vm : &ggtt->vm;

			record_context(&ee->context, ctx);

			/* We need to copy these to an anonymous buffer
			 * as the simplest method to avoid being overwritten
			 * by userspace.
			 */
			ee->batchbuffer =
				i915_error_object_create(i915, request->batch);

			if (HAS_BROKEN_CS_TLB(i915))
				ee->wa_batchbuffer =
					i915_error_object_create(i915,
								 i915->gt.scratch);
			request_record_user_bo(request, ee);

			ee->ctx =
				i915_error_object_create(i915,
							 request->hw_context->state);

			error->simulated |=
				i915_gem_context_no_error_capture(ctx);

			ee->rq_head = request->head;
			ee->rq_post = request->postfix;
			ee->rq_tail = request->tail;

			ring = request->ring;
			ee->cpu_ring_head = ring->head;
			ee->cpu_ring_tail = ring->tail;
			ee->ringbuffer =
				i915_error_object_create(i915, ring->vma);

			engine_record_requests(engine, request, ee);
		}

		ee->hws_page =
			i915_error_object_create(i915,
						 engine->status_page.vma);

		ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma);

		ee->default_state = capture_object(i915, engine->default_state);
	}
}

static void gem_capture_vm(struct i915_gpu_state *error,
			   struct i915_address_space *vm,
			   int idx)
{
	struct drm_i915_error_buffer *active_bo;
	struct i915_vma *vma;
	int count;

	count = 0;
	list_for_each_entry(vma, &vm->active_list, vm_link)
		count++;

	active_bo = NULL;
	if (count)
		active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
	if (active_bo)
		count = capture_error_bo(active_bo, count, &vm->active_list, false);
	else
		count = 0;

	error->active_vm[idx] = vm;
	error->active_bo[idx] = active_bo;
	error->active_bo_count[idx] = count;
}

static void capture_active_buffers(struct i915_gpu_state *error)
{
	int cnt = 0, i, j;

	BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));

	/* Scan each engine looking for unique active contexts/vm */
	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		struct drm_i915_error_engine *ee = &error->engine[i];
		bool found;

		if (!ee->vm)
			continue;

		found = false;
		for (j = 0; j < i && !found; j++)
			found = error->engine[j].vm == ee->vm;
		if (!found)
			gem_capture_vm(error, ee->vm, cnt++);
	}
}

static void capture_pinned_buffers(struct i915_gpu_state *error)
{
	struct i915_address_space *vm = &error->i915->ggtt.vm;
	struct drm_i915_error_buffer *bo;
	struct i915_vma *vma;
	int count_inactive, count_active;

	count_inactive = 0;
	list_for_each_entry(vma, &vm->inactive_list, vm_link)
		count_inactive++;

	count_active = 0;
	list_for_each_entry(vma, &vm->active_list, vm_link)
		count_active++;

	bo = NULL;
	if (count_inactive + count_active)
		bo = kcalloc(count_inactive + count_active,
			     sizeof(*bo), GFP_ATOMIC);
	if (!bo)
		return;

	count_inactive = capture_error_bo(bo, count_inactive,
					  &vm->active_list, true);
	count_active = capture_error_bo(bo + count_inactive, count_active,
					&vm->inactive_list, true);
	error->pinned_bo_count = count_inactive + count_active;
	error->pinned_bo = bo;
}

static void capture_uc_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;
	struct i915_error_uc *error_uc = &error->uc;

	/* Capturing uC state won't be useful if there is no GuC */
	if (!error->device_info.has_guc)
		return;

	error_uc->guc_fw = i915->guc.fw;
	error_uc->huc_fw = i915->huc.fw;

	/* Non-default firmware paths will be specified by the modparam.
	 * As modparams are generally accessible from userspace, make
	 * explicit copies of the firmware paths.
	 */
	error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC);
	error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC);
	error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma);
}

/* Capture all registers which don't fit into another category. */
static void capture_reg_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *dev_priv = error->i915;
	int i;

	/* General organization
	 * 1. Registers specific to a single generation
	 * 2. Registers which belong to multiple generations
	 * 3. Feature specific registers.
	 * 4. Everything else
	 * Please try to follow the order.
	 */

	/* 1: Registers specific to a single generation */
	if (IS_VALLEYVIEW(dev_priv)) {
		error->gtier[0] = I915_READ(GTIER);
		error->ier = I915_READ(VLV_IER);
		error->forcewake = I915_READ_FW(FORCEWAKE_VLV);
	}

	if (IS_GEN(dev_priv, 7))
		error->err_int = I915_READ(GEN7_ERR_INT);

	if (INTEL_GEN(dev_priv) >= 8) {
		error->fault_data0 = I915_READ(GEN8_FAULT_TLB_DATA0);
		error->fault_data1 = I915_READ(GEN8_FAULT_TLB_DATA1);
	}

	if (IS_GEN(dev_priv, 6)) {
		error->forcewake = I915_READ_FW(FORCEWAKE);
		error->gab_ctl = I915_READ(GAB_CTL);
		error->gfx_mode = I915_READ(GFX_MODE);
	}

	/* 2: Registers which belong to multiple generations */
	if (INTEL_GEN(dev_priv) >= 7)
		error->forcewake = I915_READ_FW(FORCEWAKE_MT);

	if (INTEL_GEN(dev_priv) >= 6) {
		error->derrmr = I915_READ(DERRMR);
		error->error = I915_READ(ERROR_GEN6);
		error->done_reg = I915_READ(DONE_REG);
	}

	if (INTEL_GEN(dev_priv) >= 5)
		error->ccid = I915_READ(CCID);

	/* 3: Feature specific registers */
	if (IS_GEN_RANGE(dev_priv, 6, 7)) {
		error->gam_ecochk = I915_READ(GAM_ECOCHK);
		error->gac_eco = I915_READ(GAC_ECO_BITS);
	}

	/* 4: Everything else */
	if (INTEL_GEN(dev_priv) >= 11) {
		error->ier = I915_READ(GEN8_DE_MISC_IER);
		error->gtier[0] = I915_READ(GEN11_RENDER_COPY_INTR_ENABLE);
		error->gtier[1] = I915_READ(GEN11_VCS_VECS_INTR_ENABLE);
		error->gtier[2] = I915_READ(GEN11_GUC_SG_INTR_ENABLE);
		error->gtier[3] = I915_READ(GEN11_GPM_WGBOXPERF_INTR_ENABLE);
		error->gtier[4] = I915_READ(GEN11_CRYPTO_RSVD_INTR_ENABLE);
		error->gtier[5] = I915_READ(GEN11_GUNIT_CSME_INTR_ENABLE);
		error->ngtier = 6;
	} else if (INTEL_GEN(dev_priv) >= 8) {
		error->ier = I915_READ(GEN8_DE_MISC_IER);
		for (i = 0; i < 4; i++)
			error->gtier[i] = I915_READ(GEN8_GT_IER(i));
		error->ngtier = 4;
	} else if (HAS_PCH_SPLIT(dev_priv)) {
		error->ier = I915_READ(DEIER);
		error->gtier[0] = I915_READ(GTIER);
		error->ngtier = 1;
	} else if (IS_GEN(dev_priv, 2)) {
		error->ier = I915_READ16(IER);
	} else if (!IS_VALLEYVIEW(dev_priv)) {
		error->ier = I915_READ(IER);
	}
	error->eir = I915_READ(EIR);
	error->pgtbl_er = I915_READ(PGTBL_ER);
}

static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
				   struct i915_gpu_state *error,
				   u32 engine_mask,
				   const char *error_msg)
{
	u32 ecode;
	int engine_id = -1, len;

	ecode = i915_error_generate_code(dev_priv, error, &engine_id);

	len = scnprintf(error->error_msg, sizeof(error->error_msg),
			"GPU HANG: ecode %d:%d:0x%08x",
			INTEL_GEN(dev_priv), engine_id, ecode);

	if (engine_id != -1 && error->engine[engine_id].context.pid)
		len += scnprintf(error->error_msg + len,
				 sizeof(error->error_msg) - len,
				 ", in %s [%d]",
				 error->engine[engine_id].context.comm,
				 error->engine[engine_id].context.pid);

	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
		  ", reason: %s, action: %s",
		  error_msg,
		  engine_mask ? "reset" : "continue");
}

static void capture_gen_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;

	error->awake = i915->gt.awake;
	error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
	error->suspended = i915->runtime_pm.suspended;

	error->iommu = -1;
#ifdef CONFIG_INTEL_IOMMU
	error->iommu = intel_iommu_gfx_mapped;
#endif
	error->reset_count = i915_reset_count(&i915->gpu_error);
	error->suspend_count = i915->suspend_count;

	memcpy(&error->device_info,
	       INTEL_INFO(i915),
	       sizeof(error->device_info));
	memcpy(&error->runtime_info,
	       RUNTIME_INFO(i915),
	       sizeof(error->runtime_info));
	error->driver_caps = i915->caps;
}

static void capture_params(struct i915_gpu_state *error)
{
	i915_params_copy(&error->params, &i915_modparams);
}

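/*
 * The epoch is the hangcheck timestamp of the oldest stalled engine, or
 * the capture time if nothing stalled; request and hangcheck times in the
 * dump are printed relative to it.
 */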
static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
{
	unsigned long epoch = error->capture;
	int i;

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		const struct drm_i915_error_engine *ee = &error->engine[i];

		if (ee->hangcheck_stalled &&
		    time_before(ee->hangcheck_timestamp, epoch))
			epoch = ee->hangcheck_timestamp;
	}

	return epoch;
}

static void capture_finish(struct i915_gpu_state *error)
{
	struct i915_ggtt *ggtt = &error->i915->ggtt;
	const u64 slot = ggtt->error_capture.start;

	ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
}

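/*
 * Run as a stop_machine() callback: every other CPU is held spinning with
 * interrupts disabled, so nothing here may sleep and allocations must be
 * atomic. In exchange the capture sees a stable, un-raced snapshot.
 */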
static int capture(void *data)
{
	struct i915_gpu_state *error = data;

	error->time = ktime_get_real();
	error->boottime = ktime_get_boottime();
	error->uptime = ktime_sub(ktime_get(),
				  error->i915->gt.last_init_time);
	error->capture = jiffies;

	capture_params(error);
	capture_gen_state(error);
	capture_uc_state(error);
	capture_reg_state(error);
	gem_record_fences(error);
	gem_record_rings(error);
	capture_active_buffers(error);
	capture_pinned_buffers(error);

	error->overlay = intel_overlay_capture_error_state(error->i915);
	error->display = intel_display_capture_error_state(error->i915);

	error->epoch = capture_find_epoch(error);

	capture_finish(error);
	return 0;
}

#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))

struct i915_gpu_state *
i915_capture_gpu_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	/* Check if GPU capture has been disabled */
	error = READ_ONCE(i915->gpu_error.first_error);
	if (IS_ERR(error))
		return error;

	error = kzalloc(sizeof(*error), GFP_ATOMIC);
	if (!error) {
		i915_disable_error_state(i915, -ENOMEM);
		return ERR_PTR(-ENOMEM);
	}

	kref_init(&error->ref);
	error->i915 = i915;

	stop_machine(capture, error, NULL);

	return error;
}

/**
 * i915_capture_error_state - capture an error record for later analysis
 * @i915: i915 device
 * @engine_mask: the mask of engines triggering the hang
 * @error_msg: a message to insert into the error capture header
 *
 * Should be called when an error is detected (either a hang or an error
 * interrupt) to capture error state from the time of the error.  Fills
 * out a structure which becomes available in debugfs for user level tools
 * to pick up.
 */
void i915_capture_error_state(struct drm_i915_private *i915,
			      u32 engine_mask,
			      const char *error_msg)
{
	static bool warned;
	struct i915_gpu_state *error;
	unsigned long flags;

	if (!i915_modparams.error_capture)
		return;

	if (READ_ONCE(i915->gpu_error.first_error))
		return;

	error = i915_capture_gpu_state(i915);
	if (IS_ERR(error))
		return;

	i915_error_capture_msg(i915, error, engine_mask, error_msg);
	DRM_INFO("%s\n", error->error_msg);

	if (!error->simulated) {
		spin_lock_irqsave(&i915->gpu_error.lock, flags);
		if (!i915->gpu_error.first_error) {
			i915->gpu_error.first_error = error;
			error = NULL;
		}
		spin_unlock_irqrestore(&i915->gpu_error.lock, flags);
	}

	if (error) {
		__i915_gpu_state_free(&error->ref);
		return;
	}

	if (!warned &&
	    ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
		DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
		DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
		DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
		DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
		DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
			 i915->drm.primary->index);
		warned = true;
	}
}

struct i915_gpu_state *
i915_first_error_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
	if (!IS_ERR_OR_NULL(error))
		i915_gpu_state_get(error);
	spin_unlock_irq(&i915->gpu_error.lock);

	return error;
}

void i915_reset_error_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
	if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
		i915->gpu_error.first_error = NULL;
	spin_unlock_irq(&i915->gpu_error.lock);

	if (!IS_ERR_OR_NULL(error))
		i915_gpu_state_put(error);
}

void i915_disable_error_state(struct drm_i915_private *i915, int err)
{
	spin_lock_irq(&i915->gpu_error.lock);
	if (!i915->gpu_error.first_error)
		i915->gpu_error.first_error = ERR_PTR(err);
	spin_unlock_irq(&i915->gpu_error.lock);
}