/*
 * Copyright (c) 2008 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Keith Packard <keithp@keithp.com>
 *    Mika Kuoppala <mika.kuoppala@intel.com>
 *
 */

#include <linux/ascii85.h>
#include <linux/nmi.h>
#include <linux/scatterlist.h>
#include <linux/stop_machine.h>
#include <linux/utsname.h>
#include <linux/zlib.h>

#include <drm/drm_print.h>

#include "i915_gpu_error.h"
#include "i915_drv.h"

static inline const struct intel_engine_cs *
engine_lookup(const struct drm_i915_private *i915, unsigned int id)
{
	if (id >= I915_NUM_ENGINES)
		return NULL;

	return i915->engine[id];
}

static inline const char *
__engine_name(const struct intel_engine_cs *engine)
{
	return engine ? engine->name : "";
}

static const char *
engine_name(const struct drm_i915_private *i915, unsigned int id)
{
	return __engine_name(engine_lookup(i915, id));
}

static const char *tiling_flag(int tiling)
{
	switch (tiling) {
	default:
	case I915_TILING_NONE: return "";
	case I915_TILING_X: return " X";
	case I915_TILING_Y: return " Y";
	}
}

static const char *dirty_flag(int dirty)
{
	return dirty ? " dirty" : "";
}

static const char *purgeable_flag(int purgeable)
{
	return purgeable ? " purgeable" : "";
}

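/*
 * Error state text accumulates in kmalloc'd chunks; each filled chunk is
 * recorded as a scatterlist entry, reusing sg->dma_address to remember the
 * chunk's logical offset into the overall log. This avoids ever needing
 * one large contiguous allocation for the full dump.
 */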
static void __sg_set_buf(struct scatterlist *sg,
			 void *addr, unsigned int len, loff_t it)
{
	sg->page_link = (unsigned long)virt_to_page(addr);
	sg->offset = offset_in_page(addr);
	sg->length = len;
	sg->dma_address = it;
}

static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
{
	if (!len)
		return false;

	if (e->bytes + len + 1 <= e->size)
		return true;

	if (e->bytes) {
		__sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
		e->iter += e->bytes;
		e->buf = NULL;
		e->bytes = 0;
	}

	if (e->cur == e->end) {
		struct scatterlist *sgl;

		sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
		if (!sgl) {
			e->err = -ENOMEM;
			return false;
		}
		if (e->cur) {
			e->cur->offset = 0;
			e->cur->length = 0;
			e->cur->page_link =
				(unsigned long)sgl | SG_CHAIN;
		} else {
			e->sgl = sgl;
		}

		e->cur = sgl;
		e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
	}

	e->size = ALIGN(len + 1, SZ_64K);
	e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
	if (!e->buf) {
		e->size = PAGE_ALIGN(len + 1);
		e->buf = kmalloc(e->size, GFP_KERNEL);
	}
	if (!e->buf) {
		e->err = -ENOMEM;
		return false;
	}

	return true;
}

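/*
 * Two-pass formatting: vsnprintf(NULL, 0, ...) first measures the output so
 * the buffer can be grown, then vscnprintf() writes into the reserved space.
 * Any failure is latched in e->err, short-circuiting all later printing.
 */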
__printf(2, 0)
static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
			       const char *fmt, va_list args)
{
	va_list ap;
	int len;

	if (e->err)
		return;

	va_copy(ap, args);
	len = vsnprintf(NULL, 0, fmt, ap);
	va_end(ap);
	if (len <= 0) {
		e->err = len;
		return;
	}

	if (!__i915_error_grow(e, len))
		return;

	GEM_BUG_ON(e->bytes >= e->size);
	len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
	if (len < 0) {
		e->err = len;
		return;
	}
	e->bytes += len;
}

static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
{
	unsigned len;

	if (e->err || !str)
		return;

	len = strlen(str);
	if (!__i915_error_grow(e, len))
		return;

	GEM_BUG_ON(e->bytes + len > e->size);
	memcpy(e->buf + e->bytes, str, len);
	e->bytes += len;
}

#define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
#define err_puts(e, s) i915_error_puts(e, s)

static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
{
	i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
}

static inline struct drm_printer
i915_error_printer(struct drm_i915_error_state_buf *e)
{
	struct drm_printer p = {
		.printfn = __i915_printfn_error,
		.arg = e,
	};
	return p;
}

#ifdef CONFIG_DRM_I915_COMPRESS_ERROR

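/*
 * When CONFIG_DRM_I915_COMPRESS_ERROR is enabled, captured pages are
 * deflated with zlib as they are copied; otherwise (see the #else branch
 * below) each page is snapshotted verbatim. err_compression_marker()
 * emits ":" or "~" so the decoder knows which encoding was used.
 */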
struct compress {
	struct z_stream_s zstream;
	void *tmp;
};

static bool compress_init(struct compress *c)
{
	struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));

	zstream->workspace =
		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
			GFP_ATOMIC | __GFP_NOWARN);
	if (!zstream->workspace)
		return false;

	if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
		kfree(zstream->workspace);
		return false;
	}

	c->tmp = NULL;
	if (i915_has_memcpy_from_wc())
		c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);

	return true;
}

static void *compress_next_page(struct drm_i915_error_object *dst)
{
	unsigned long page;

	if (dst->page_count >= dst->num_pages)
		return ERR_PTR(-ENOSPC);

	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
	if (!page)
		return ERR_PTR(-ENOMEM);

	return dst->pages[dst->page_count++] = (void *)page;
}

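/*
 * Feed one source page through zlib in atomic context, allocating output
 * pages on demand via compress_next_page(). Where possible the WC source is
 * first staged into c->tmp using i915_memcpy_from_wc() to avoid slow
 * uncached reads.
 */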
static int compress_page(struct compress *c,
			 void *src,
			 struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	zstream->next_in = src;
	if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
		zstream->next_in = c->tmp;
	zstream->avail_in = PAGE_SIZE;

	do {
		if (zstream->avail_out == 0) {
			zstream->next_out = compress_next_page(dst);
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);

			zstream->avail_out = PAGE_SIZE;
		}

		if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
			return -EIO;

		touch_nmi_watchdog();
	} while (zstream->avail_in);

	/* Fallback to uncompressed if we increase size? */
	if (0 && zstream->total_out > zstream->total_in)
		return -E2BIG;

	return 0;
}

static int compress_flush(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	do {
		switch (zlib_deflate(zstream, Z_FINISH)) {
		case Z_OK: /* more space requested */
			zstream->next_out = compress_next_page(dst);
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);

			zstream->avail_out = PAGE_SIZE;
			break;

		case Z_STREAM_END:
			goto end;

		default: /* any error */
			return -EIO;
		}
	} while (1);

end:
	memset(zstream->next_out, 0, zstream->avail_out);
	dst->unused = zstream->avail_out;
	return 0;
}

static void compress_fini(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	zlib_deflateEnd(zstream);
	kfree(zstream->workspace);
	if (c->tmp)
		free_page((unsigned long)c->tmp);
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, ":");
}

#else

struct compress {
};

static bool compress_init(struct compress *c)
{
	return true;
}

static int compress_page(struct compress *c,
			 void *src,
			 struct drm_i915_error_object *dst)
{
	unsigned long page;
	void *ptr;

	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
	if (!page)
		return -ENOMEM;

	ptr = (void *)page;
	if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
		memcpy(ptr, src, PAGE_SIZE);
	dst->pages[dst->page_count++] = ptr;

	return 0;
}

static int compress_flush(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	return 0;
}

static void compress_fini(struct compress *c,
			  struct drm_i915_error_object *dst)
{
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, "~");
}

#endif

static void print_error_buffers(struct drm_i915_error_state_buf *m,
				const char *name,
				struct drm_i915_error_buffer *err,
				int count)
{
	err_printf(m, "%s [%d]:\n", name, count);

	while (count--) {
		err_printf(m, "    %08x_%08x %8u %02x %02x %02x",
			   upper_32_bits(err->gtt_offset),
			   lower_32_bits(err->gtt_offset),
			   err->size,
			   err->read_domains,
			   err->write_domain,
			   err->wseqno);
		err_puts(m, tiling_flag(err->tiling));
		err_puts(m, dirty_flag(err->dirty));
		err_puts(m, purgeable_flag(err->purgeable));
		err_puts(m, err->userptr ? " userptr" : "");
		err_puts(m, err->engine != -1 ? " " : "");
		err_puts(m, engine_name(m->i915, err->engine));
		err_puts(m, i915_cache_level_str(m->i915, err->cache_level));

		if (err->name)
			err_printf(m, " (name: %d)", err->name);
		if (err->fence_reg != I915_FENCE_REG_NONE)
			err_printf(m, " (fence: %d)", err->fence_reg);

		err_puts(m, "\n");
		err++;
	}
}

static void error_print_instdone(struct drm_i915_error_state_buf *m,
				 const struct drm_i915_error_engine *ee)
{
	int slice;
	int subslice;

	err_printf(m, "  INSTDONE: 0x%08x\n",
		   ee->instdone.instdone);

	if (ee->engine_id != RCS || INTEL_GEN(m->i915) <= 3)
		return;

	err_printf(m, "  SC_INSTDONE: 0x%08x\n",
		   ee->instdone.slice_common);

	if (INTEL_GEN(m->i915) <= 6)
		return;

	for_each_instdone_slice_subslice(m->i915, slice, subslice)
		err_printf(m, "  SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
			   slice, subslice,
			   ee->instdone.sampler[slice][subslice]);

	for_each_instdone_slice_subslice(m->i915, slice, subslice)
		err_printf(m, "  ROW_INSTDONE[%d][%d]: 0x%08x\n",
			   slice, subslice,
			   ee->instdone.row[slice][subslice]);
}

static const char *bannable(const struct drm_i915_error_context *ctx)
{
	return ctx->bannable ? "" : " (unbannable)";
}

static void error_print_request(struct drm_i915_error_state_buf *m,
				const char *prefix,
				const struct drm_i915_error_request *erq,
				const unsigned long epoch)
{
	if (!erq->seqno)
		return;

	err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
		   prefix, erq->pid, erq->ban_score,
		   erq->context, erq->seqno, erq->sched_attr.priority,
		   jiffies_to_msecs(erq->jiffies - epoch),
		   erq->start, erq->head, erq->tail);
}

static void error_print_context(struct drm_i915_error_state_buf *m,
				const char *header,
				const struct drm_i915_error_context *ctx)
{
	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n",
		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
		   ctx->sched_attr.priority, ctx->ban_score, bannable(ctx),
		   ctx->guilty, ctx->active);
}

static void error_print_engine(struct drm_i915_error_state_buf *m,
			       const struct drm_i915_error_engine *ee,
			       const unsigned long epoch)
{
	int n;

	err_printf(m, "%s command stream:\n",
		   engine_name(m->i915, ee->engine_id));
	err_printf(m, "  IDLE?: %s\n", yesno(ee->idle));
	err_printf(m, "  START: 0x%08x\n", ee->start);
	err_printf(m, "  HEAD:  0x%08x [0x%08x]\n", ee->head, ee->rq_head);
	err_printf(m, "  TAIL:  0x%08x [0x%08x, 0x%08x]\n",
		   ee->tail, ee->rq_post, ee->rq_tail);
	err_printf(m, "  CTL:   0x%08x\n", ee->ctl);
	err_printf(m, "  MODE:  0x%08x\n", ee->mode);
	err_printf(m, "  HWS:   0x%08x\n", ee->hws);
	err_printf(m, "  ACTHD: 0x%08x %08x\n",
		   (u32)(ee->acthd>>32), (u32)ee->acthd);
	err_printf(m, "  IPEIR: 0x%08x\n", ee->ipeir);
	err_printf(m, "  IPEHR: 0x%08x\n", ee->ipehr);

	error_print_instdone(m, ee);

	if (ee->batchbuffer) {
		u64 start = ee->batchbuffer->gtt_offset;
		u64 end = start + ee->batchbuffer->gtt_size;

		err_printf(m, "  batch: [0x%08x_%08x, 0x%08x_%08x]\n",
			   upper_32_bits(start), lower_32_bits(start),
			   upper_32_bits(end), lower_32_bits(end));
	}
	if (INTEL_GEN(m->i915) >= 4) {
		err_printf(m, "  BBADDR: 0x%08x_%08x\n",
			   (u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
		err_printf(m, "  BB_STATE: 0x%08x\n", ee->bbstate);
		err_printf(m, "  INSTPS: 0x%08x\n", ee->instps);
	}
	err_printf(m, "  INSTPM: 0x%08x\n", ee->instpm);
	err_printf(m, "  FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
		   lower_32_bits(ee->faddr));
	if (INTEL_GEN(m->i915) >= 6) {
		err_printf(m, "  RC PSMI: 0x%08x\n", ee->rc_psmi);
		err_printf(m, "  FAULT_REG: 0x%08x\n", ee->fault_reg);
510 511 512 513 514 515 516
		err_printf(m, "  SYNC_0: 0x%08x\n",
			   ee->semaphore_mboxes[0]);
		err_printf(m, "  SYNC_1: 0x%08x\n",
			   ee->semaphore_mboxes[1]);
		if (HAS_VEBOX(m->i915))
			err_printf(m, "  SYNC_2: 0x%08x\n",
				   ee->semaphore_mboxes[2]);
	}
	if (HAS_PPGTT(m->i915)) {
		err_printf(m, "  GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);

		if (INTEL_GEN(m->i915) >= 8) {
			int i;
			for (i = 0; i < 4; i++)
				err_printf(m, "  PDP%d: 0x%016llx\n",
					   i, ee->vm_info.pdp[i]);
		} else {
			err_printf(m, "  PP_DIR_BASE: 0x%08x\n",
				   ee->vm_info.pp_dir_base);
		}
	}
	err_printf(m, "  seqno: 0x%08x\n", ee->seqno);
	err_printf(m, "  last_seqno: 0x%08x\n", ee->last_seqno);
	err_printf(m, "  waiting: %s\n", yesno(ee->waiting));
	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
	err_printf(m, "  hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
	err_printf(m, "  hangcheck action: %s\n",
		   hangcheck_action_to_str(ee->hangcheck_action));
	err_printf(m, "  hangcheck action timestamp: %dms (%lu%s)\n",
		   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
		   ee->hangcheck_timestamp,
		   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
	err_printf(m, "  engine reset count: %u\n", ee->reset_count);

	for (n = 0; n < ee->num_ports; n++) {
		err_printf(m, "  ELSP[%d]:", n);
		error_print_request(m, " ", &ee->execlist[n], epoch);
	}

	error_print_context(m, "  Active context: ", &ee->context);
}

void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
	va_list args;

	va_start(args, f);
	i915_error_vprintf(e, f, args);
	va_end(args);
}

static void print_error_obj(struct drm_i915_error_state_buf *m,
			    struct intel_engine_cs *engine,
			    const char *name,
			    struct drm_i915_error_object *obj)
{
	char out[ASCII85_BUFSZ];
	int page;

	if (!obj)
		return;

	if (name) {
		err_printf(m, "%s --- %s = 0x%08x %08x\n",
			   engine ? engine->name : "global", name,
			   upper_32_bits(obj->gtt_offset),
			   lower_32_bits(obj->gtt_offset));
	}

	err_compression_marker(m);
	for (page = 0; page < obj->page_count; page++) {
		int i, len;

		len = PAGE_SIZE;
		if (page == obj->page_count - 1)
			len -= obj->unused;
		len = ascii85_encode_len(len);

		for (i = 0; i < len; i++)
			err_puts(m, ascii85_encode(obj->pages[page][i], out));
	}
	err_puts(m, "\n");
}

static void err_print_capabilities(struct drm_i915_error_state_buf *m,
				   const struct intel_device_info *info,
				   const struct intel_driver_caps *caps)
{
	struct drm_printer p = i915_error_printer(m);

	intel_device_info_dump_flags(info, &p);
	intel_driver_caps_print(caps, &p);
	intel_device_info_dump_topology(&info->sseu, &p);
}

static void err_print_params(struct drm_i915_error_state_buf *m,
			     const struct i915_params *params)
{
	struct drm_printer p = i915_error_printer(m);

	i915_params_dump(params, &p);
}

static void err_print_pciid(struct drm_i915_error_state_buf *m,
			    struct drm_i915_private *i915)
{
	struct pci_dev *pdev = i915->drm.pdev;

	err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
	err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
	err_printf(m, "PCI Subsystem: %04x:%04x\n",
		   pdev->subsystem_vendor,
		   pdev->subsystem_device);
}

static void err_print_uc(struct drm_i915_error_state_buf *m,
			 const struct i915_error_uc *error_uc)
{
	struct drm_printer p = i915_error_printer(m);
	const struct i915_gpu_state *error =
		container_of(error_uc, typeof(*error), uc);

	if (!error->device_info.has_guc)
		return;

	intel_uc_fw_dump(&error_uc->guc_fw, &p);
	intel_uc_fw_dump(&error_uc->huc_fw, &p);
	print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log);
}

static void err_free_sgl(struct scatterlist *sgl)
{
	while (sgl) {
		struct scatterlist *sg;

		for (sg = sgl; !sg_is_chain(sg); sg++) {
			kfree(sg_virt(sg));
			if (sg_is_last(sg))
				break;
		}

		sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg);
		free_page((unsigned long)sgl);
		sgl = sg;
	}
}

static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
			       struct i915_gpu_state *error)
{
	struct drm_i915_error_object *obj;
	struct timespec64 ts;
	int i, j;

	if (*error->error_msg)
		err_printf(m, "%s\n", error->error_msg);
	err_printf(m, "Kernel: %s\n", init_utsname()->release);
	ts = ktime_to_timespec64(error->time);
	err_printf(m, "Time: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	ts = ktime_to_timespec64(error->boottime);
	err_printf(m, "Boottime: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	ts = ktime_to_timespec64(error->uptime);
	err_printf(m, "Uptime: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ);
	err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n",
		   error->capture,
		   jiffies_to_msecs(jiffies - error->capture),
		   jiffies_to_msecs(error->capture - error->epoch));

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		if (error->engine[i].hangcheck_stalled &&
		    error->engine[i].context.pid) {
			err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
				   engine_name(m->i915, i),
				   error->engine[i].context.comm,
				   error->engine[i].context.pid,
				   error->engine[i].context.ban_score,
				   bannable(&error->engine[i].context));
		}
	}
	err_printf(m, "Reset count: %u\n", error->reset_count);
	err_printf(m, "Suspend count: %u\n", error->suspend_count);
	err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
	err_print_pciid(m, m->i915);

	err_printf(m, "IOMMU enabled?: %d\n", error->iommu);
	if (HAS_CSR(m->i915)) {
		struct intel_csr *csr = &m->i915->csr;
703 704 705 706 707 708 709 710

		err_printf(m, "DMC loaded: %s\n",
			   yesno(csr->dmc_payload != NULL));
		err_printf(m, "DMC fw version: %d.%d\n",
			   CSR_VERSION_MAJOR(csr->version),
			   CSR_VERSION_MINOR(csr->version));
	}

	err_printf(m, "GT awake: %s\n", yesno(error->awake));
	err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
	err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
	err_printf(m, "EIR: 0x%08x\n", error->eir);
	err_printf(m, "IER: 0x%08x\n", error->ier);
	for (i = 0; i < error->ngtier; i++)
		err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]);
	err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
	err_printf(m, "CCID: 0x%08x\n", error->ccid);
	err_printf(m, "Missed interrupts: 0x%08lx\n",
		   m->i915->gpu_error.missed_irq_rings);

	for (i = 0; i < error->nfence; i++)
		err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);

	if (INTEL_GEN(m->i915) >= 6) {
		err_printf(m, "ERROR: 0x%08x\n", error->error);

		if (INTEL_GEN(m->i915) >= 8)
			err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
				   error->fault_data1, error->fault_data0);

		err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
	}

	if (IS_GEN7(m->i915))
		err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		if (error->engine[i].engine_id != -1)
			error_print_engine(m, &error->engine[i], error->epoch);
	}

	for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
		char buf[128];
		int len, first = 1;
		if (!error->active_vm[i])
			break;

		len = scnprintf(buf, sizeof(buf), "Active (");
		for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
			if (error->engine[j].vm != error->active_vm[i])
				continue;

			len += scnprintf(buf + len, sizeof(buf), "%s%s",
					 first ? "" : ", ",
					 m->i915->engine[j]->name);
			first = 0;
		}
		scnprintf(buf + len, sizeof(buf), ")");
		print_error_buffers(m, buf,
				    error->active_bo[i],
				    error->active_bo_count[i]);
	}
768

769 770 771 772
	print_error_buffers(m, "Pinned (global)",
			    error->pinned_bo,
			    error->pinned_bo_count);

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		const struct drm_i915_error_engine *ee = &error->engine[i];

		obj = ee->batchbuffer;
		if (obj) {
			err_puts(m, m->i915->engine[i]->name);
			if (ee->context.pid)
				err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)",
					   ee->context.comm,
					   ee->context.pid,
					   ee->context.handle,
					   ee->context.hw_id,
					   ee->context.ban_score,
					   bannable(&ee->context));
			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
				   upper_32_bits(obj->gtt_offset),
				   lower_32_bits(obj->gtt_offset));
			print_error_obj(m, m->i915->engine[i], NULL, obj);
		}

		for (j = 0; j < ee->user_bo_count; j++)
			print_error_obj(m, m->i915->engine[i],
					"user", ee->user_bo[j]);

		if (ee->num_requests) {
			err_printf(m, "%s --- %d requests\n",
				   m->i915->engine[i]->name,
				   ee->num_requests);
			for (j = 0; j < ee->num_requests; j++)
				error_print_request(m, " ",
						    &ee->requests[j],
						    error->epoch);
		}

		if (IS_ERR(ee->waiters)) {
			err_printf(m, "%s --- ? waiters [unable to acquire spinlock]\n",
				   m->i915->engine[i]->name);
		} else if (ee->num_waiters) {
			err_printf(m, "%s --- %d waiters\n",
				   m->i915->engine[i]->name,
				   ee->num_waiters);
			for (j = 0; j < ee->num_waiters; j++) {
				err_printf(m, " seqno 0x%08x for %s [%d]\n",
					   ee->waiters[j].seqno,
					   ee->waiters[j].comm,
					   ee->waiters[j].pid);
			}
		}

		print_error_obj(m, m->i915->engine[i],
				"ringbuffer", ee->ringbuffer);

		print_error_obj(m, m->i915->engine[i],
				"HW Status", ee->hws_page);

		print_error_obj(m, m->i915->engine[i],
				"HW context", ee->ctx);

		print_error_obj(m, m->i915->engine[i],
				"WA context", ee->wa_ctx);

		print_error_obj(m, m->i915->engine[i],
				"WA batchbuffer", ee->wa_batchbuffer);

		print_error_obj(m, m->i915->engine[i],
				"NULL context", ee->default_state);
	}

	if (error->overlay)
		intel_overlay_print_error_state(m, error->overlay);

	if (error->display)
		intel_display_print_error_state(m, error->display);

	err_print_capabilities(m, &error->device_info, &error->driver_caps);
	err_print_params(m, &error->params);
	err_print_uc(m, &error->uc);
}

static int err_print_to_sgl(struct i915_gpu_state *error)
{
	struct drm_i915_error_state_buf m;

	if (IS_ERR(error))
		return PTR_ERR(error);

	if (READ_ONCE(error->sgl))
		return 0;

	memset(&m, 0, sizeof(m));
	m.i915 = error->i915;

	__err_print_to_sgl(&m, error);

	if (m.buf) {
		__sg_set_buf(m.cur++, m.buf, m.bytes, m.iter);
		m.bytes = 0;
		m.buf = NULL;
	}
	if (m.cur) {
		GEM_BUG_ON(m.end < m.cur);
		sg_mark_end(m.cur - 1);
	}
	GEM_BUG_ON(m.sgl && !m.cur);

	if (m.err) {
		err_free_sgl(m.sgl);
		return m.err;
	}
	if (cmpxchg(&error->sgl, NULL, m.sgl))
		err_free_sgl(m.sgl);

	return 0;
}

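/*
 * Copy a window of the formatted error state into a user-supplied buffer,
 * walking the scatterlist of text chunks. The last chunk consulted is
 * cached in error->fit so that sequential reads resume without rescanning
 * the list from the start.
 */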
ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
				      char *buf, loff_t off, size_t rem)
{
	struct scatterlist *sg;
	size_t count;
	loff_t pos;
	int err;
	if (!error || !rem)
		return 0;
	err = err_print_to_sgl(error);
	if (err)
		return err;
	sg = READ_ONCE(error->fit);
	if (!sg || off < sg->dma_address)
		sg = error->sgl;
	if (!sg)
		return 0;
	pos = sg->dma_address;
	count = 0;
	do {
		size_t len, start;

		if (sg_is_chain(sg)) {
			sg = sg_chain_ptr(sg);
			GEM_BUG_ON(sg_is_chain(sg));
		}
		len = sg->length;
		if (pos + len <= off) {
			pos += len;
			continue;
		}
		start = sg->offset;
		if (pos < off) {
			GEM_BUG_ON(off - pos > len);
			len -= off - pos;
			start += off - pos;
			pos = off;
		}

		len = min(len, rem);
		GEM_BUG_ON(!len || len > sg->length);

		memcpy(buf, page_address(sg_page(sg)) + start, len);

		count += len;
		pos += len;

		buf += len;
		rem -= len;
		if (!rem) {
			WRITE_ONCE(error->fit, sg);
			break;
		}
	} while (!sg_is_last(sg++));

	return count;
}

static void i915_error_object_free(struct drm_i915_error_object *obj)
{
	int page;

	if (obj == NULL)
		return;

	for (page = 0; page < obj->page_count; page++)
		free_page((unsigned long)obj->pages[page]);

	kfree(obj);
}

static __always_inline void free_param(const char *type, void *x)
{
	if (!__builtin_strcmp(type, "char *"))
		kfree(*(void **)x);
}

static void cleanup_params(struct i915_gpu_state *error)
{
#define FREE(T, x, ...) free_param(#T, &error->params.x);
	I915_PARAMS_FOR_EACH(FREE);
#undef FREE
}

static void cleanup_uc_state(struct i915_gpu_state *error)
{
	struct i915_error_uc *error_uc = &error->uc;

	kfree(error_uc->guc_fw.path);
	kfree(error_uc->huc_fw.path);
	i915_error_object_free(error_uc->guc_log);
}

void __i915_gpu_state_free(struct kref *error_ref)
{
	struct i915_gpu_state *error =
		container_of(error_ref, typeof(*error), ref);
	long i, j;

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		struct drm_i915_error_engine *ee = &error->engine[i];

		for (j = 0; j < ee->user_bo_count; j++)
			i915_error_object_free(ee->user_bo[j]);
		kfree(ee->user_bo);

		i915_error_object_free(ee->batchbuffer);
		i915_error_object_free(ee->wa_batchbuffer);
		i915_error_object_free(ee->ringbuffer);
		i915_error_object_free(ee->hws_page);
		i915_error_object_free(ee->ctx);
		i915_error_object_free(ee->wa_ctx);

		kfree(ee->requests);
		if (!IS_ERR_OR_NULL(ee->waiters))
			kfree(ee->waiters);
	}

	for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
		kfree(error->active_bo[i]);
	kfree(error->pinned_bo);

	kfree(error->overlay);
	kfree(error->display);

	cleanup_params(error);
	cleanup_uc_state(error);

	err_free_sgl(error->sgl);
	kfree(error);
}

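/*
 * Snapshot the contents of a vma into freshly allocated pages, mapping the
 * backing storage page by page through the reserved GGTT slot
 * (ggtt->error_capture) and compressing as we go. This runs under atomic
 * conditions, hence the GFP_ATOMIC allocations throughout.
 */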
static struct drm_i915_error_object *
i915_error_object_create(struct drm_i915_private *i915,
			 struct i915_vma *vma)
{
	struct i915_ggtt *ggtt = &i915->ggtt;
	const u64 slot = ggtt->error_capture.start;
	struct drm_i915_error_object *dst;
	struct compress compress;
	unsigned long num_pages;
	struct sgt_iter iter;
	dma_addr_t dma;
	int ret;

	if (!vma)
		return NULL;

	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
	num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
		      GFP_ATOMIC | __GFP_NOWARN);
	if (!dst)
		return NULL;

	dst->gtt_offset = vma->node.start;
	dst->gtt_size = vma->node.size;
	dst->num_pages = num_pages;
	dst->page_count = 0;
	dst->unused = 0;

	if (!compress_init(&compress)) {
		kfree(dst);
		return NULL;
	}

	ret = -EINVAL;
	for_each_sgt_dma(dma, iter, vma->pages) {
		void __iomem *s;

		ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);

		s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
		ret = compress_page(&compress, (void  __force *)s, dst);
		io_mapping_unmap_atomic(s);
		if (ret)
			break;
	}

	if (ret || compress_flush(&compress, dst)) {
		while (dst->page_count--)
			free_page((unsigned long)dst->pages[dst->page_count]);
		kfree(dst);
		dst = NULL;
	}

	compress_fini(&compress, dst);
	return dst;
}

/* The error capture is special as it tries to run underneath the normal
 * locking rules - so we use the raw version of the i915_gem_active lookup.
 */
static inline uint32_t
__active_get_seqno(struct i915_gem_active *active)
{
	struct i915_request *request;

	request = __i915_gem_active_peek(active);
	return request ? request->global_seqno : 0;
}

static inline int
__active_get_engine_id(struct i915_gem_active *active)
{
	struct i915_request *request;

	request = __i915_gem_active_peek(active);
	return request ? request->engine->id : -1;
}

static void capture_bo(struct drm_i915_error_buffer *err,
		       struct i915_vma *vma)
{
	struct drm_i915_gem_object *obj = vma->obj;

	err->size = obj->base.size;
	err->name = obj->base.name;

	err->wseqno = __active_get_seqno(&obj->frontbuffer_write);
	err->engine = __active_get_engine_id(&obj->frontbuffer_write);

	err->gtt_offset = vma->node.start;
	err->read_domains = obj->read_domains;
	err->write_domain = obj->write_domain;
	err->fence_reg = vma->fence ? vma->fence->id : -1;
	err->tiling = i915_gem_object_get_tiling(obj);
	err->dirty = obj->mm.dirty;
	err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
	err->userptr = obj->userptr.mm != NULL;
	err->cache_level = obj->cache_level;
}

static u32 capture_error_bo(struct drm_i915_error_buffer *err,
			    int count, struct list_head *head,
			    bool pinned_only)
{
	struct i915_vma *vma;
	int i = 0;

	list_for_each_entry(vma, head, vm_link) {
		if (!vma->obj)
			continue;

		if (pinned_only && !i915_vma_is_pinned(vma))
			continue;

		capture_bo(err++, vma);
		if (++i == count)
			break;
	}

	return i;
}

/* Generate a semi-unique error code. The code is not meant to have meaning;
 * its only purpose is to try to prevent false duplicated bug reports by
 * grossly estimating a GPU error state.
 *
 * TODO: Ideally, hashing the batchbuffer would be a very nice way to determine
 * the hang if we could strip the GTT offset information from it.
 *
 * It's only a small step better than a random number in its current form.
 */
static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
					 struct i915_gpu_state *error,
					 int *engine_id)
{
	uint32_t error_code = 0;
	int i;

	/* IPEHR would be an ideal way to detect errors, as it's the gross
	 * measure of "the command that hung." However, it contains some very
	 * common synchronization commands which almost always appear when
	 * the hang is strictly a client bug. Use instdone to help
	 * differentiate those cases.
	 */
	for (i = 0; i < I915_NUM_ENGINES; i++) {
		if (error->engine[i].hangcheck_stalled) {
			if (engine_id)
				*engine_id = i;

			return error->engine[i].ipehr ^
			       error->engine[i].instdone.instdone;
		}
	}

	return error_code;
}

static void gem_record_fences(struct i915_gpu_state *error)
{
	struct drm_i915_private *dev_priv = error->i915;
	int i;

	if (INTEL_GEN(dev_priv) >= 6) {
1190
		for (i = 0; i < dev_priv->num_fence_regs; i++)
1191 1192
			error->fence[i] = I915_READ64(FENCE_REG_GEN6_LO(i));
	} else if (INTEL_GEN(dev_priv) >= 4) {
1193 1194
		for (i = 0; i < dev_priv->num_fence_regs; i++)
			error->fence[i] = I915_READ64(FENCE_REG_965_LO(i));
1195
	} else {
1196
		for (i = 0; i < dev_priv->num_fence_regs; i++)
1197
			error->fence[i] = I915_READ(FENCE_REG(i));
1198
	}
1199
	error->nfence = i;
1200 1201
}

static void gen6_record_semaphore_state(struct intel_engine_cs *engine,
					struct drm_i915_error_engine *ee)
{
	struct drm_i915_private *dev_priv = engine->i915;

	ee->semaphore_mboxes[0] = I915_READ(RING_SYNC_0(engine->mmio_base));
	ee->semaphore_mboxes[1] = I915_READ(RING_SYNC_1(engine->mmio_base));
	if (HAS_VEBOX(dev_priv))
		ee->semaphore_mboxes[2] =
			I915_READ(RING_SYNC_2(engine->mmio_base));
}

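/*
 * Record the tasks currently waiting upon this engine. Since the capture
 * may run in awkward contexts, only trylock the breadcrumb rb_lock; on
 * contention the waiters are reported as unknown via ERR_PTR(-EDEADLK)
 * rather than risking a deadlock.
 */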
static void error_record_engine_waiters(struct intel_engine_cs *engine,
					struct drm_i915_error_engine *ee)
{
	struct intel_breadcrumbs *b = &engine->breadcrumbs;
	struct drm_i915_error_waiter *waiter;
	struct rb_node *rb;
	int count;

	ee->num_waiters = 0;
	ee->waiters = NULL;

	if (RB_EMPTY_ROOT(&b->waiters))
		return;

	if (!spin_trylock_irq(&b->rb_lock)) {
		ee->waiters = ERR_PTR(-EDEADLK);
		return;
	}

	count = 0;
	for (rb = rb_first(&b->waiters); rb != NULL; rb = rb_next(rb))
		count++;
	spin_unlock_irq(&b->rb_lock);

	waiter = NULL;
	if (count)
		waiter = kmalloc_array(count,
				       sizeof(struct drm_i915_error_waiter),
				       GFP_ATOMIC);
	if (!waiter)
		return;

	if (!spin_trylock_irq(&b->rb_lock)) {
		kfree(waiter);
		ee->waiters = ERR_PTR(-EDEADLK);
		return;
	}

	ee->waiters = waiter;
	for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
		struct intel_wait *w = rb_entry(rb, typeof(*w), node);

		strcpy(waiter->comm, w->tsk->comm);
		waiter->pid = w->tsk->pid;
		waiter->seqno = w->seqno;
		waiter++;

		if (++ee->num_waiters == count)
			break;
	}
	spin_unlock_irq(&b->rb_lock);
}

static void error_record_engine_registers(struct i915_gpu_state *error,
					  struct intel_engine_cs *engine,
					  struct drm_i915_error_engine *ee)
{
	struct drm_i915_private *dev_priv = engine->i915;

	if (INTEL_GEN(dev_priv) >= 6) {
		ee->rc_psmi = I915_READ(RING_PSMI_CTL(engine->mmio_base));
		if (INTEL_GEN(dev_priv) >= 8) {
			ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG);
		} else {
			gen6_record_semaphore_state(engine, ee);
			ee->fault_reg = I915_READ(RING_FAULT_REG(engine));
		}
	}

	if (INTEL_GEN(dev_priv) >= 4) {
		ee->faddr = I915_READ(RING_DMA_FADD(engine->mmio_base));
		ee->ipeir = I915_READ(RING_IPEIR(engine->mmio_base));
		ee->ipehr = I915_READ(RING_IPEHR(engine->mmio_base));
		ee->instps = I915_READ(RING_INSTPS(engine->mmio_base));
		ee->bbaddr = I915_READ(RING_BBADDR(engine->mmio_base));
		if (INTEL_GEN(dev_priv) >= 8) {
			ee->faddr |= (u64) I915_READ(RING_DMA_FADD_UDW(engine->mmio_base)) << 32;
			ee->bbaddr |= (u64) I915_READ(RING_BBADDR_UDW(engine->mmio_base)) << 32;
		}
		ee->bbstate = I915_READ(RING_BBSTATE(engine->mmio_base));
	} else {
		ee->faddr = I915_READ(DMA_FADD_I8XX);
		ee->ipeir = I915_READ(IPEIR);
		ee->ipehr = I915_READ(IPEHR);
	}

	intel_engine_get_instdone(engine, &ee->instdone);

	ee->waiting = intel_engine_has_waiter(engine);
	ee->instpm = I915_READ(RING_INSTPM(engine->mmio_base));
	ee->acthd = intel_engine_get_active_head(engine);
	ee->seqno = intel_engine_get_seqno(engine);
	ee->last_seqno = intel_engine_last_submit(engine);
	ee->start = I915_READ_START(engine);
	ee->head = I915_READ_HEAD(engine);
	ee->tail = I915_READ_TAIL(engine);
	ee->ctl = I915_READ_CTL(engine);
	if (INTEL_GEN(dev_priv) > 2)
		ee->mode = I915_READ_MODE(engine);

	if (!HWS_NEEDS_PHYSICAL(dev_priv)) {
		i915_reg_t mmio;

		if (IS_GEN7(dev_priv)) {
			switch (engine->id) {
			default:
			case RCS:
				mmio = RENDER_HWS_PGA_GEN7;
				break;
			case BCS:
				mmio = BLT_HWS_PGA_GEN7;
				break;
			case VCS:
				mmio = BSD_HWS_PGA_GEN7;
				break;
			case VECS:
				mmio = VEBOX_HWS_PGA_GEN7;
				break;
			}
		} else if (IS_GEN6(engine->i915)) {
			mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
		} else {
			/* XXX: gen8 returns to sanity */
			mmio = RING_HWS_PGA(engine->mmio_base);
		}

		ee->hws = I915_READ(mmio);
	}

	ee->idle = intel_engine_is_idle(engine);
	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
	ee->hangcheck_action = engine->hangcheck.action;
	ee->hangcheck_stalled = engine->hangcheck.stalled;
	ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
						  engine);

	if (HAS_PPGTT(dev_priv)) {
		int i;

		ee->vm_info.gfx_mode = I915_READ(RING_MODE_GEN7(engine));

		if (IS_GEN6(dev_priv))
			ee->vm_info.pp_dir_base =
				I915_READ(RING_PP_DIR_BASE_READ(engine));
		else if (IS_GEN7(dev_priv))
			ee->vm_info.pp_dir_base =
				I915_READ(RING_PP_DIR_BASE(engine));
		else if (INTEL_GEN(dev_priv) >= 8)
			for (i = 0; i < 4; i++) {
				ee->vm_info.pdp[i] =
					I915_READ(GEN8_RING_PDP_UDW(engine, i));
				ee->vm_info.pdp[i] <<= 32;
				ee->vm_info.pdp[i] |=
					I915_READ(GEN8_RING_PDP_LDW(engine, i));
			}
	}
}

static void record_request(struct i915_request *request,
			   struct drm_i915_error_request *erq)
{
	struct i915_gem_context *ctx = request->gem_context;

	erq->context = ctx->hw_id;
	erq->sched_attr = request->sched.attr;
	erq->ban_score = atomic_read(&ctx->ban_score);
	erq->seqno = request->global_seqno;
	erq->jiffies = request->emitted_jiffies;
	erq->start = i915_ggtt_offset(request->ring->vma);
	erq->head = request->head;
	erq->tail = request->tail;

	rcu_read_lock();
	erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0;
	rcu_read_unlock();
}

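/*
 * Snapshot the requests on the engine timeline, starting from the active
 * request. The list may still mutate underneath us, so the second walk
 * re-checks its bounds and simply stops early if the list shrank in the
 * meantime (see the comment inside the loop below).
 */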
static void engine_record_requests(struct intel_engine_cs *engine,
				   struct i915_request *first,
				   struct drm_i915_error_engine *ee)
{
	struct i915_request *request;
	int count;

	count = 0;
	request = first;
	list_for_each_entry_from(request, &engine->timeline.requests, link)
		count++;
	if (!count)
		return;

	ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
	if (!ee->requests)
		return;

	ee->num_requests = count;

	count = 0;
	request = first;
	list_for_each_entry_from(request, &engine->timeline.requests, link) {
		if (count >= ee->num_requests) {
			/*
			 * If the ring request list was changed in
			 * between the point where the error request
			 * list was created and dimensioned and this
			 * point then just exit early to avoid crashes.
			 *
			 * We don't need to communicate that the
			 * request list changed state during error
			 * state capture and that the error state is
			 * slightly incorrect as a consequence since we
			 * are typically only interested in the request
			 * list state at the point of error state
			 * capture, not in any changes happening during
			 * the capture.
			 */
			break;
		}

		record_request(request, &ee->requests[count++]);
	}
	ee->num_requests = count;
}

static void error_record_engine_execlists(struct intel_engine_cs *engine,
					  struct drm_i915_error_engine *ee)
{
	const struct intel_engine_execlists * const execlists = &engine->execlists;
	unsigned int n;

	for (n = 0; n < execlists_num_ports(execlists); n++) {
		struct i915_request *rq = port_request(&execlists->port[n]);

		if (!rq)
			break;

		record_request(rq, &ee->execlist[n]);
	}

	ee->num_ports = n;
}

static void record_context(struct drm_i915_error_context *e,
			   struct i915_gem_context *ctx)
{
	if (ctx->pid) {
		struct task_struct *task;

		rcu_read_lock();
		task = pid_task(ctx->pid, PIDTYPE_PID);
		if (task) {
			strcpy(e->comm, task->comm);
			e->pid = task->pid;
		}
		rcu_read_unlock();
	}

	e->handle = ctx->user_handle;
	e->hw_id = ctx->hw_id;
	e->sched_attr = ctx->sched;
	e->ban_score = atomic_read(&ctx->ban_score);
	e->bannable = i915_gem_context_is_bannable(ctx);
	e->guilty = atomic_read(&ctx->guilty_count);
	e->active = atomic_read(&ctx->active_count);
}

static void request_record_user_bo(struct i915_request *request,
				   struct drm_i915_error_engine *ee)
{
	struct i915_capture_list *c;
	struct drm_i915_error_object **bo;
	long count, max;

	max = 0;
	for (c = request->capture_list; c; c = c->next)
		max++;
	if (!max)
		return;

	bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
	if (!bo) {
		/* If we can't capture everything, try to capture something. */
		max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
		bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
	}
	if (!bo)
		return;

	count = 0;
	for (c = request->capture_list; c; c = c->next) {
		bo[count] = i915_error_object_create(request->i915, c->vma);
		if (!bo[count])
			break;
		if (++count == max)
			break;
	}

	ee->user_bo = bo;
	ee->user_bo_count = count;
}

static struct drm_i915_error_object *
capture_object(struct drm_i915_private *dev_priv,
	       struct drm_i915_gem_object *obj)
{
	if (obj && i915_gem_object_has_pages(obj)) {
		struct i915_vma fake = {
			.node = { .start = U64_MAX, .size = obj->base.size },
			.size = obj->base.size,
			.pages = obj->mm.pages,
			.obj = obj,
		};

		return i915_error_object_create(dev_priv, &fake);
	} else {
		return NULL;
	}
}

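/*
 * For each engine, record its registers, waiters and execlist ports, and,
 * if a request was active at the time of the hang, snapshot the request's
 * batch, ring, context image and any user-requested capture objects.
 */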
static void gem_record_rings(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;
	struct i915_ggtt *ggtt = &i915->ggtt;
	int i;

	for (i = 0; i < I915_NUM_ENGINES; i++) {
		struct intel_engine_cs *engine = i915->engine[i];
		struct drm_i915_error_engine *ee = &error->engine[i];
		struct i915_request *request;

		ee->engine_id = -1;

		if (!engine)
			continue;

		ee->engine_id = i;

		error_record_engine_registers(error, engine, ee);
		error_record_engine_waiters(engine, ee);
		error_record_engine_execlists(engine, ee);

		request = i915_gem_find_active_request(engine);
		if (request) {
			struct i915_gem_context *ctx = request->gem_context;
			struct intel_ring *ring;

			ee->vm = ctx->ppgtt ? &ctx->ppgtt->vm : &ggtt->vm;

			record_context(&ee->context, ctx);

			/* We need to copy these to an anonymous buffer
			 * as the simplest method to avoid being overwritten
			 * by userspace.
			 */
			ee->batchbuffer =
				i915_error_object_create(i915, request->batch);

			if (HAS_BROKEN_CS_TLB(i915))
				ee->wa_batchbuffer =
					i915_error_object_create(i915,
								 i915->gt.scratch);
			request_record_user_bo(request, ee);

			ee->ctx =
				i915_error_object_create(i915,
							 request->hw_context->state);

			error->simulated |=
				i915_gem_context_no_error_capture(ctx);

			ee->rq_head = request->head;
			ee->rq_post = request->postfix;
			ee->rq_tail = request->tail;

			ring = request->ring;
			ee->cpu_ring_head = ring->head;
			ee->cpu_ring_tail = ring->tail;
			ee->ringbuffer =
				i915_error_object_create(i915, ring->vma);

			engine_record_requests(engine, request, ee);
		}

		ee->hws_page =
			i915_error_object_create(i915,
						 engine->status_page.vma);

		ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma);

		ee->default_state = capture_object(i915, engine->default_state);
	}
}

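/*
 * Capture the buffer objects on one VM's active list. Each unique VM seen
 * across the engines is captured once; capture_active_buffers() below does
 * the de-duplication scan.
 */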
static void gem_capture_vm(struct i915_gpu_state *error,
			   struct i915_address_space *vm,
			   int idx)
{
	struct drm_i915_error_buffer *active_bo;
	struct i915_vma *vma;
	int count;

	count = 0;
	list_for_each_entry(vma, &vm->active_list, vm_link)
		count++;

	active_bo = NULL;
	if (count)
		active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
	if (active_bo)
		count = capture_error_bo(active_bo, count, &vm->active_list, false);
	else
		count = 0;

	error->active_vm[idx] = vm;
	error->active_bo[idx] = active_bo;
	error->active_bo_count[idx] = count;
}

static void capture_active_buffers(struct i915_gpu_state *error)
{
	int cnt = 0, i, j;

	BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));

	/* Scan each engine looking for unique active contexts/vm */
	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		struct drm_i915_error_engine *ee = &error->engine[i];
		bool found;

		if (!ee->vm)
			continue;
		found = false;
		for (j = 0; j < i && !found; j++)
			found = error->engine[j].vm == ee->vm;
		if (!found)
			gem_capture_vm(error, ee->vm, cnt++);
	}
}

static void capture_pinned_buffers(struct i915_gpu_state *error)
{
	struct i915_address_space *vm = &error->i915->ggtt.vm;
	struct drm_i915_error_buffer *bo;
	struct i915_vma *vma;
	int count_inactive, count_active;

	count_inactive = 0;
	list_for_each_entry(vma, &vm->inactive_list, vm_link)
		count_inactive++;

	count_active = 0;
	list_for_each_entry(vma, &vm->active_list, vm_link)
		count_active++;

	bo = NULL;
	if (count_inactive + count_active)
		bo = kcalloc(count_inactive + count_active,
			     sizeof(*bo), GFP_ATOMIC);
	if (!bo)
		return;

	count_inactive = capture_error_bo(bo, count_inactive,
					  &vm->active_list, true);
	count_active = capture_error_bo(bo + count_inactive, count_active,
					&vm->inactive_list, true);
	error->pinned_bo_count = count_inactive + count_active;
	error->pinned_bo = bo;
}

static void capture_uc_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;
	struct i915_error_uc *error_uc = &error->uc;

	/* Capturing uC state won't be useful if there is no GuC */
	if (!error->device_info.has_guc)
		return;

	error_uc->guc_fw = i915->guc.fw;
	error_uc->huc_fw = i915->huc.fw;

	/* Non-default firmware paths will be specified by the modparam.
	 * As modparams are generally accessible from userspace, make
	 * explicit copies of the firmware paths.
	 */
	error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC);
	error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC);
	error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma);
}

/* Capture all registers which don't fit into another category. */
static void capture_reg_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *dev_priv = error->i915;
	int i;

	/* General organization
	 * 1. Registers specific to a single generation
	 * 2. Registers which belong to multiple generations
	 * 3. Feature specific registers.
	 * 4. Everything else
	 * Please try to follow the order.
	 */

	/* 1: Registers specific to a single generation */
	if (IS_VALLEYVIEW(dev_priv)) {
		error->gtier[0] = I915_READ(GTIER);
		error->ier = I915_READ(VLV_IER);
		error->forcewake = I915_READ_FW(FORCEWAKE_VLV);
	}

	if (IS_GEN7(dev_priv))
		error->err_int = I915_READ(GEN7_ERR_INT);

	if (INTEL_GEN(dev_priv) >= 8) {
		error->fault_data0 = I915_READ(GEN8_FAULT_TLB_DATA0);
		error->fault_data1 = I915_READ(GEN8_FAULT_TLB_DATA1);
	}

	if (IS_GEN6(dev_priv)) {
		error->forcewake = I915_READ_FW(FORCEWAKE);
		error->gab_ctl = I915_READ(GAB_CTL);
		error->gfx_mode = I915_READ(GFX_MODE);
	}

	/* 2: Registers which belong to multiple generations */
	if (INTEL_GEN(dev_priv) >= 7)
		error->forcewake = I915_READ_FW(FORCEWAKE_MT);

	if (INTEL_GEN(dev_priv) >= 6) {
		error->derrmr = I915_READ(DERRMR);
		error->error = I915_READ(ERROR_GEN6);
		error->done_reg = I915_READ(DONE_REG);
	}

	if (INTEL_GEN(dev_priv) >= 5)
		error->ccid = I915_READ(CCID);

	/* 3: Feature specific registers */
	if (IS_GEN6(dev_priv) || IS_GEN7(dev_priv)) {
		error->gam_ecochk = I915_READ(GAM_ECOCHK);
		error->gac_eco = I915_READ(GAC_ECO_BITS);
	}

	/* 4: Everything else */
	if (INTEL_GEN(dev_priv) >= 11) {
		error->ier = I915_READ(GEN8_DE_MISC_IER);
		error->gtier[0] = I915_READ(GEN11_RENDER_COPY_INTR_ENABLE);
		error->gtier[1] = I915_READ(GEN11_VCS_VECS_INTR_ENABLE);
		error->gtier[2] = I915_READ(GEN11_GUC_SG_INTR_ENABLE);
		error->gtier[3] = I915_READ(GEN11_GPM_WGBOXPERF_INTR_ENABLE);
		error->gtier[4] = I915_READ(GEN11_CRYPTO_RSVD_INTR_ENABLE);
		error->gtier[5] = I915_READ(GEN11_GUNIT_CSME_INTR_ENABLE);
		error->ngtier = 6;
	} else if (INTEL_GEN(dev_priv) >= 8) {
		error->ier = I915_READ(GEN8_DE_MISC_IER);
		for (i = 0; i < 4; i++)
			error->gtier[i] = I915_READ(GEN8_GT_IER(i));
		error->ngtier = 4;
	} else if (HAS_PCH_SPLIT(dev_priv)) {
		error->ier = I915_READ(DEIER);
		error->gtier[0] = I915_READ(GTIER);
		error->ngtier = 1;
	} else if (IS_GEN2(dev_priv)) {
		error->ier = I915_READ16(IER);
	} else if (!IS_VALLEYVIEW(dev_priv)) {
		error->ier = I915_READ(IER);
	}
	error->eir = I915_READ(EIR);
	error->pgtbl_er = I915_READ(PGTBL_ER);
}

static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
				   struct i915_gpu_state *error,
				   u32 engine_mask,
				   const char *error_msg)
{
	u32 ecode;
	int engine_id = -1, len;

	ecode = i915_error_generate_code(dev_priv, error, &engine_id);

	len = scnprintf(error->error_msg, sizeof(error->error_msg),
			"GPU HANG: ecode %d:%d:0x%08x",
			INTEL_GEN(dev_priv), engine_id, ecode);

	if (engine_id != -1 && error->engine[engine_id].context.pid)
		len += scnprintf(error->error_msg + len,
				 sizeof(error->error_msg) - len,
				 ", in %s [%d]",
				 error->engine[engine_id].context.comm,
				 error->engine[engine_id].context.pid);

	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
		  ", reason: %s, action: %s",
		  error_msg,
		  engine_mask ? "reset" : "continue");
}

static void capture_gen_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;

	error->awake = i915->gt.awake;
	error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
	error->suspended = i915->runtime_pm.suspended;
	error->iommu = -1;
#ifdef CONFIG_INTEL_IOMMU
	error->iommu = intel_iommu_gfx_mapped;
#endif
	error->reset_count = i915_reset_count(&i915->gpu_error);
	error->suspend_count = i915->suspend_count;

	memcpy(&error->device_info,
	       INTEL_INFO(i915),
	       sizeof(error->device_info));
	error->driver_caps = i915->caps;
}

static __always_inline void dup_param(const char *type, void *x)
{
	if (!__builtin_strcmp(type, "char *"))
		*(void **)x = kstrdup(*(void **)x, GFP_ATOMIC);
}

static void capture_params(struct i915_gpu_state *error)
{
	error->params = i915_modparams;
#define DUP(T, x, ...) dup_param(#T, &error->params.x);
	I915_PARAMS_FOR_EACH(DUP);
#undef DUP
}

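/*
 * The epoch is the timestamp of the earliest stalled hangcheck across all
 * engines (defaulting to the capture time itself); timestamps elsewhere in
 * the dump are reported relative to it.
 */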
static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
{
	unsigned long epoch = error->capture;
	int i;

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		const struct drm_i915_error_engine *ee = &error->engine[i];

		if (ee->hangcheck_stalled &&
		    time_before(ee->hangcheck_timestamp, epoch))
			epoch = ee->hangcheck_timestamp;
	}

	return epoch;
}

static void capture_finish(struct i915_gpu_state *error)
{
	struct i915_ggtt *ggtt = &error->i915->ggtt;
	const u64 slot = ggtt->error_capture.start;

	ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
}

static int capture(void *data)
{
	struct i915_gpu_state *error = data;

	error->time = ktime_get_real();
	error->boottime = ktime_get_boottime();
	error->uptime = ktime_sub(ktime_get(),
				  error->i915->gt.last_init_time);
	error->capture = jiffies;

	capture_params(error);
	capture_gen_state(error);
	capture_uc_state(error);
	capture_reg_state(error);
	gem_record_fences(error);
	gem_record_rings(error);
	capture_active_buffers(error);
	capture_pinned_buffers(error);

	error->overlay = intel_overlay_capture_error_state(error->i915);
	error->display = intel_display_capture_error_state(error->i915);

	error->epoch = capture_find_epoch(error);

	capture_finish(error);
	return 0;
}

#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))

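/*
 * The capture runs under stop_machine() so that the saved state is a
 * coherent snapshot: no other CPU can modify the GPU bookkeeping while
 * capture() executes.
 */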
struct i915_gpu_state *
i915_capture_gpu_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	/* Check if GPU capture has been disabled */
	error = READ_ONCE(i915->gpu_error.first_error);
	if (IS_ERR(error))
		return error;

	error = kzalloc(sizeof(*error), GFP_ATOMIC);
	if (!error) {
		i915_disable_error_state(i915, -ENOMEM);
		return ERR_PTR(-ENOMEM);
	}

	kref_init(&error->ref);
	error->i915 = i915;

	stop_machine(capture, error, NULL);

	return error;
}

/**
 * i915_capture_error_state - capture an error record for later analysis
 * @i915: i915 device
 * @engine_mask: the mask of engines triggering the hang
 * @error_msg: a message to insert into the error capture header
 *
 * Should be called when an error is detected (either a hang or an error
 * interrupt) to capture error state from the time of the error.  Fills
 * out a structure which becomes available in debugfs for user level tools
 * to pick up.
 */
void i915_capture_error_state(struct drm_i915_private *i915,
			      u32 engine_mask,
			      const char *error_msg)
{
	static bool warned;
	struct i915_gpu_state *error;
	unsigned long flags;

	if (!i915_modparams.error_capture)
		return;

	if (READ_ONCE(i915->gpu_error.first_error))
		return;

	error = i915_capture_gpu_state(i915);
	if (IS_ERR(error))
		return;

	i915_error_capture_msg(i915, error, engine_mask, error_msg);
	DRM_INFO("%s\n", error->error_msg);

	if (!error->simulated) {
		spin_lock_irqsave(&i915->gpu_error.lock, flags);
		if (!i915->gpu_error.first_error) {
			i915->gpu_error.first_error = error;
1965 1966
			error = NULL;
		}
1967
		spin_unlock_irqrestore(&i915->gpu_error.lock, flags);
1968 1969
	}

1970
	if (error) {
1971
		__i915_gpu_state_free(&error->ref);
1972 1973 1974
		return;
	}

1975 1976
	if (!warned &&
	    ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
1977 1978 1979 1980
		DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
		DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
		DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
		DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
1981
		DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
1982
			 i915->drm.primary->index);
1983 1984
		warned = true;
	}
1985 1986
}

1987 1988
struct i915_gpu_state *
i915_first_error_state(struct drm_i915_private *i915)
1989
{
1990
	struct i915_gpu_state *error;
1991

1992 1993
	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
1994
	if (!IS_ERR_OR_NULL(error))
1995 1996
		i915_gpu_state_get(error);
	spin_unlock_irq(&i915->gpu_error.lock);
1997

1998
	return error;
1999 2000
}

2001
void i915_reset_error_state(struct drm_i915_private *i915)
2002
{
2003
	struct i915_gpu_state *error;
2004

2005 2006
	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
2007 2008
	if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
		i915->gpu_error.first_error = NULL;
2009
	spin_unlock_irq(&i915->gpu_error.lock);
2010

2011
	if (!IS_ERR_OR_NULL(error))
2012 2013 2014 2015 2016 2017 2018 2019 2020
		i915_gpu_state_put(error);
}

void i915_disable_error_state(struct drm_i915_private *i915, int err)
{
	spin_lock_irq(&i915->gpu_error.lock);
	if (!i915->gpu_error.first_error)
		i915->gpu_error.first_error = ERR_PTR(err);
	spin_unlock_irq(&i915->gpu_error.lock);
2021
}