mce_amd.c 24.6 KB
Newer Older
1
#include <linux/module.h>
2 3
#include <linux/slab.h>

B
Borislav Petkov 已提交
4
#include "mce_amd.h"
D
Doug Thompson 已提交
5

6 7
static struct amd_decoder_ops *fam_ops;

8
static u8 xec_mask	 = 0xf;
9

10
static bool report_gart_errors;
11
static void (*decode_dram_ecc)(int node_id, struct mce *m);
12 13 14 15 16 17 18

/*
 * Enable (@v == true) or disable reporting of the bank 4 / extended code 0x5
 * errors that amd_filter_mce() otherwise suppresses.
 */
void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);

19
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
20
{
21
	decode_dram_ecc = f;
22 23 24
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);

25
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
26
{
27 28
	if (decode_dram_ecc) {
		WARN_ON(decode_dram_ecc != f);
29

30
		decode_dram_ecc = NULL;
31 32 33 34
	}
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);

D
Doug Thompson 已提交
35 36 37 38
/*
 * string representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 */
B
Borislav Petkov 已提交
39 40

/* transaction type (presumably indexed through TT_MSG() from mce_amd.h - confirm) */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
D
Doug Thompson 已提交
42

B
Borislav Petkov 已提交
43
/* cache level (presumably indexed through LL_MSG() from mce_amd.h - confirm) */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
D
Doug Thompson 已提交
45

B
Borislav Petkov 已提交
46
/* memory transaction type (presumably indexed through R4_MSG() from mce_amd.h - confirm) */
static const char * const rrrr_msgs[] = {
       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

B
Borislav Petkov 已提交
51
/* participating processor; exported, so it has external linkage unlike the sibling tables */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);
D
Doug Thompson 已提交
54

B
Borislav Petkov 已提交
55
/* request timeout (presumably indexed through TO_MSG() from mce_amd.h - confirm) */
static const char * const to_msgs[] = { "no timeout", "timed out" };
D
Doug Thompson 已提交
57

B
Borislav Petkov 已提交
58
/* memory or i/o (presumably indexed through II_MSG() from mce_amd.h - confirm) */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
D
Doug Thompson 已提交
60

61
/* internal error type (presumably indexed through UU_MSG() from mce_amd.h - confirm) */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
63

64
/*
 * Family 15h MC1 descriptions. The table is dense while the extended error
 * codes are not; f15h_mc1_mce() maps xec 0x0-0xa directly, 0xd to xec - 2 and
 * 0x10-0x15 to xec - 4.
 */
static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",		/* xec = 0x0 */
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",		/* xec = 0xa */
	"PFB valid bit parity error",			/* xec = 0xd */
	"Microcode Patch Buffer",			/* xec = 0x10 */
	"uop queue",					/* xec = 0x11 */
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"				/* xec = 0x15 */
};

85
/*
 * Family 15h MC2 descriptions. f15h_mc2_mce() maps xec 0x4-0xc to xec - 0x4
 * and xec 0x10-0x14 to xec - 0x7.
 */
static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills",			/* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",			/* xec = 0xc */
	"L2 Tag ECC error",				/* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"			/* xec = 0x14 */
};

102
/*
 * MC4 descriptions. decode_mc4_mce() indexes entries 0x0-0xe directly by xec
 * and maps xec 0x1c-0x1f to xec - 13.
 */
static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",		/* xec = 0x0 */
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",		/* xec = 0x8 */
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",			/* xec = 0xe */
	"L3 data cache ECC error",			/* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"	/* xec = 0x1f */
};

124
/*
 * MC5 descriptions, indexed directly by xec (0x0-0xd). decode_mc5_mce()
 * appends " parity error" to every entry except 0x0 and 0xc.
 */
static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",			/* xec = 0x0 */
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",				/* xec = 0xc */
	"Retire status queue"				/* xec = 0xd */
};

141 142 143 144 145 146 147 148 149
/*
 * MC6 descriptions, indexed directly by xec; decode_mc6_mce() rejects any
 * xec above 0x5 and appends " parity error" to the selected entry.
 */
static const char * const mc6_mce_desc[] = {
	[0x0] = "Hardware Assertion",
	[0x1] = "Free List",
	[0x2] = "Physical Register File",
	[0x3] = "Retire Queue",
	[0x4] = "Scheduler table",
	[0x5] = "Status Register File",
};

150
/* Scalable MCA error strings */

/* SMCA_LS descriptions, indexed by xec through smca_mce_descs[]. */
static const char * const smca_ls_mce_desc[] = {
	"Load queue parity",
	"Store queue parity",
	"Miss address buffer payload parity",
	"L1 TLB parity",
	"Reserved",
	"DC tag error type 6",
	"DC tag error type 1",
	"Internal error type 1",
	"Internal error type 2",
	"Sys Read data error thread 0",
	"Sys read data error thread 1",
	"DC tag error type 2",
	"DC data error type 1 (poison consumption)",
	"DC data error type 2",
	"DC data error type 3",
	"DC tag error type 4",
	"L2 TLB parity",
	"PDC parity error",
	"DC tag error type 3",
	"DC tag error type 5",
	"L2 fill data error",
};

175
/* SMCA_IF descriptions, indexed by xec through smca_mce_descs[]. */
static const char * const smca_if_mce_desc[] = {
	"microtag probe port parity error",
	"IC microtag or full tag multi-hit error",
	"IC full tag parity",
	"IC data array parity",
	"Decoupling queue phys addr parity error",
	"L0 ITLB parity error",
	"L1 ITLB parity error",
	"L2 ITLB parity error",
	"BPQ snoop parity on Thread 0",
	"BPQ snoop parity on Thread 1",
	"L1 BTB multi-match error",
	"L2 BTB multi-match error",
	"L2 Cache Response Poison error",
	"System Read Data error",
};

192
/* SMCA_L2_CACHE descriptions, indexed by xec through smca_mce_descs[]. */
static const char * const smca_l2_mce_desc[] = {
	"L2M tag multi-way-hit error",
	"L2M tag ECC error",
	"L2M data ECC error",
	"HW assert",
};

199
/* SMCA_DE descriptions, indexed by xec through smca_mce_descs[]. */
static const char * const smca_de_mce_desc[] = {
	"uop cache tag parity error",
	"uop cache data parity error",
	"Insn buffer parity error",
	"uop queue parity error",
	"Insn dispatch queue parity error",
	"Fetch address FIFO parity",
	"Patch RAM data parity",
	"Patch RAM sequencer parity",
	"uop buffer parity"
};

211
/* SMCA_EX descriptions, indexed by xec through smca_mce_descs[]. */
static const char * const smca_ex_mce_desc[] = {
	"Watchdog timeout error",
	"Phy register file parity",
	"Flag register file parity",
	"Immediate displacement register file parity",
	"Address generator payload parity",
	"EX payload parity",
	"Checkpoint queue parity",
	"Retire dispatch queue parity",
	"Retire status queue parity error",
	"Scheduling queue parity error",
	"Branch buffer queue parity error",
};

225
/* SMCA_FP descriptions, indexed by xec through smca_mce_descs[]. */
static const char * const smca_fp_mce_desc[] = {
	"Physical register file parity",
	"Freelist parity error",
	"Schedule queue parity",
	"NSQ parity error",
	"Retire queue parity",
	"Status register file parity",
	"Hardware assertion",
};

235
/* SMCA_L3_CACHE descriptions, indexed by xec through smca_mce_descs[]. */
static const char * const smca_l3_mce_desc[] = {
	"Shadow tag macro ECC error",
	"Shadow tag macro multi-way-hit error",
	"L3M tag ECC error",
	"L3M tag multi-way-hit error",
	"L3M data ECC error",
	"XI parity, L3 fill done channel error",
	"L3 victim queue parity",
	"L3 HW assert",
};

246
/* SMCA_CS descriptions, indexed by xec through smca_mce_descs[]. */
static const char * const smca_cs_mce_desc[] = {
	"Illegal request from transport layer",
	"Address violation",
	"Security violation",
	"Illegal response from transport layer",
	"Unexpected response",
	"Parity error on incoming request or probe response data",
	"Parity error on incoming read response data",
	"Atomic request parity",
	"ECC error on probe filter access",
};

258
/* SMCA_PIE descriptions, indexed by xec through smca_mce_descs[]. */
static const char * const smca_pie_mce_desc[] = {
	"HW assert",
	"Internal PIE register security violation",
	"Error on GMI link",
	"Poison data written to internal PIE register",
};

265
/*
 * SMCA_UMC descriptions, indexed by xec through smca_mce_descs[].
 * decode_smca_errors() additionally hands xec 0 to the DRAM ECC decoder.
 */
static const char * const smca_umc_mce_desc[] = {
	"DRAM ECC error",
	"Data poison error on DRAM",
	"SDP parity error",
	"Advanced peripheral bus error",
	"Command/address parity error",
	"Write data CRC error",
};

274
/* SMCA_PB descriptions, indexed by xec through smca_mce_descs[]. */
static const char * const smca_pb_mce_desc[] = {
	"Parameter Block RAM ECC error",
};

278
/* SMCA_PSP descriptions, indexed by xec through smca_mce_descs[]. */
static const char * const smca_psp_mce_desc[] = {
	"PSP RAM ECC or parity error",
};

282
/* SMCA_SMU descriptions, indexed by xec through smca_mce_descs[]. */
static const char * const smca_smu_mce_desc[] = {
	"SMU RAM ECC or parity error",
};

286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
/* One SMCA bank type's description table together with its length. */
struct smca_mce_desc {
	const char * const *descs;	/* description strings, indexed by xec */
	unsigned int num_descs;		/* number of entries in @descs */
};

/*
 * Description table lookup, keyed by SMCA bank type. decode_smca_errors()
 * uses num_descs to bound the extended error code before indexing descs.
 */
#define SMCA_DESC_TABLE(tbl)	{ .descs = (tbl), .num_descs = ARRAY_SIZE(tbl) }

static struct smca_mce_desc smca_mce_descs[] = {
	[SMCA_LS]	= SMCA_DESC_TABLE(smca_ls_mce_desc),
	[SMCA_IF]	= SMCA_DESC_TABLE(smca_if_mce_desc),
	[SMCA_L2_CACHE]	= SMCA_DESC_TABLE(smca_l2_mce_desc),
	[SMCA_DE]	= SMCA_DESC_TABLE(smca_de_mce_desc),
	[SMCA_EX]	= SMCA_DESC_TABLE(smca_ex_mce_desc),
	[SMCA_FP]	= SMCA_DESC_TABLE(smca_fp_mce_desc),
	[SMCA_L3_CACHE]	= SMCA_DESC_TABLE(smca_l3_mce_desc),
	[SMCA_CS]	= SMCA_DESC_TABLE(smca_cs_mce_desc),
	[SMCA_PIE]	= SMCA_DESC_TABLE(smca_pie_mce_desc),
	[SMCA_UMC]	= SMCA_DESC_TABLE(smca_umc_mce_desc),
	[SMCA_PB]	= SMCA_DESC_TABLE(smca_pb_mce_desc),
	[SMCA_PSP]	= SMCA_DESC_TABLE(smca_psp_mce_desc),
	[SMCA_SMU]	= SMCA_DESC_TABLE(smca_smu_mce_desc),
};

#undef SMCA_DESC_TABLE

307
/*
 * Family 12h MC0 decoder; also the final fallback of the f10h/k8 variants.
 * Continues the "MC0 Error: " line started by decode_mc0_mce().
 *
 * @ec:  error code, as produced by EC(m->status)
 * @xec: extended error code (unused here)
 *
 * Returns true if a description was printed, false if the signature is not
 * a memory error at cache level L1 or L2.
 */
static bool f12h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = false;

	if (MEM_ERROR(ec)) {
		u8 ll = LL(ec);
		ret = true;

		if (ll == LL_L2)
			pr_cont("during L1 linefill from L2.\n");
		else if (ll == LL_L1)
			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
		else
			ret = false;
	}
	return ret;
}
324

325
/*
 * Family 10h MC0 decoder: recognizes the data-scrub signature (R4_GEN at
 * LL_L1) and defers everything else to f12h_mc0_mce().
 *
 * Returns true if a description was printed.
 */
static bool f10h_mc0_mce(u16 ec, u8 xec)
{
	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
		pr_cont("during data scrub.\n");
		return true;
	}
	return f12h_mc0_mce(ec, xec);
}

334
/*
 * K8 MC0 decoder (mce_amd_init() installs it for families 0xf and 0x11):
 * reports bus errors as system linefill errors and defers everything else
 * to f10h_mc0_mce().
 *
 * Returns true if a description was printed.
 */
static bool k8_mc0_mce(u16 ec, u8 xec)
{
	if (BUS_ERROR(ec)) {
		pr_cont("during system linefill.\n");
		return true;
	}

	return f10h_mc0_mce(ec, xec);
}

344
/*
 * MC0 decoder installed by mce_amd_init() for families 0x14 and 0x16.
 *
 * Memory errors must be data transactions at L1; bus errors must target
 * memory or I/O at the LL_LG level. Within each class the memory
 * transaction type selects the message.
 *
 * Returns true if the signature was recognized. Note that for a bus error
 * with an unrecognized transaction type the "System read data error on a "
 * prefix has already been printed when false is returned.
 */
static bool cat_mc0_mce(u16 ec, u8 xec)
{
	u8 r4	 = R4(ec);
	bool ret = true;

	if (MEM_ERROR(ec)) {

		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
			return false;

		switch (r4) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ? "load/hw prf" : "store"));
			break;
		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			break;
		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			break;
		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
			return false;

		pr_cont("System read data error on a ");

		switch (r4) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			break;
		case R4_DWR:
			pr_cont("store.\n");
			break;
		case R4_DRD:
			pr_cont("load.\n");
			break;
		default:
			ret = false;
		}
	} else {
		ret = false;
	}

	return ret;
}

396
/*
 * Family 15h MC0 decoder. Memory errors are selected by extended error
 * code; bus errors distinguish xec == 0 from any other value; internal
 * errors are accepted for xec up to 0x1f.
 *
 * Returns true if a description was printed, false otherwise.
 */
static bool f15h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (MEM_ERROR(ec)) {

		switch (xec) {
		case 0x0:
			pr_cont("Data Array access error.\n");
			break;

		case 0x1:
			pr_cont("UC error during a linefill from L2/NB.\n");
			break;

		case 0x2:
		case 0x11:
			pr_cont("STQ access error.\n");
			break;

		case 0x3:
			pr_cont("SCB access error.\n");
			break;

		case 0x10:
			pr_cont("Tag error.\n");
			break;

		case 0x12:
			pr_cont("LDQ access error.\n");
			break;

		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if (!xec)
			pr_cont("System Read Data Error.\n");
		else
			pr_cont(" Internal error condition type %d.\n", xec);
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x1f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;

	} else
		ret = false;

	return ret;
}

449
static void decode_mc0_mce(struct mce *m)
450
{
451 452
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);
453

454
	pr_emerg(HW_ERR "MC0 Error: ");
455 456 457

	/* TLB error signatures are the same across families */
	if (TLB_ERROR(ec)) {
458
		if (TT(ec) == TT_DATA) {
459
			pr_cont("%s TLB %s.\n", LL_MSG(ec),
460 461
				((xec == 2) ? "locked miss"
					    : (xec ? "multimatch" : "parity")));
462 463
			return;
		}
464
	} else if (fam_ops->mc0_mce(ec, xec))
465 466
		;
	else
467
		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
468 469
}

470
/*
 * K8-style MC1 decoder (mce_amd_init() installs it for families 0xf-0x12).
 * Only memory errors are handled; the cache level and, at level 0x1, the
 * memory transaction type select the message.
 *
 * NOTE(review): the literals 0x2 and 0x1 presumably equal LL_L2 and LL_L1
 * as used by f12h_mc0_mce() - confirm against mce_amd.h before replacing.
 *
 * Returns true if a description was printed, false otherwise.
 */
static bool k8_mc1_mce(u16 ec, u8 xec)
{
	u8 ll	 = LL(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (ll == 0x2)
		pr_cont("during a linefill from L2.\n");
	else if (ll == 0x1) {
		switch (R4(ec)) {
		case R4_IRD:
			pr_cont("Parity error during data load.\n");
			break;

		case R4_EVICT:
			pr_cont("Copyback Parity/Victim error.\n");
			break;

		case R4_SNOOP:
			pr_cont("Tag Snoop error.\n");
			break;

		default:
			ret = false;
			break;
		}
	} else
		ret = false;

	return ret;
}

504
/*
 * MC1 decoder installed by mce_amd_init() for families 0x14 and 0x16.
 * Accepts only memory errors on instruction transactions. The memory
 * transaction type is checked first; the extended error code is consulted
 * only when the transaction type is neither R4_IRD nor R4_SNOOP.
 *
 * Returns true if a description was printed, false otherwise.
 */
static bool cat_mc1_mce(u16 ec, u8 xec)
{
	u8 r4    = R4(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (TT(ec) != TT_INSTR)
		return false;

	if (r4 == R4_IRD)
		pr_cont("Data/tag array parity error for a tag hit.\n");
	else if (r4 == R4_SNOOP)
		pr_cont("Tag error during snoop/victimization.\n");
	else if (xec == 0x0)
		pr_cont("Tag parity error from victim castout.\n");
	else if (xec == 0x2)
		pr_cont("Microcode patch RAM parity error.\n");
	else
		ret = false;

	return ret;
}

529
/*
 * Family 15h MC1 decoder. Only memory errors are handled. The description
 * table is dense, so the sparse extended error codes are shifted down to
 * table indices: 0x0-0xa map directly, 0xd maps to xec - 2 and 0x10-0x15
 * map to xec - 4.
 *
 * Returns true if a description was printed, false otherwise.
 */
static bool f15h_mc1_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x0 ... 0xa:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
		break;

	case 0xd:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
		break;

	case 0x10:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	case 0x11 ... 0x15:
		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	default:
		ret = false;
	}
	return ret;
}

559
static void decode_mc1_mce(struct mce *m)
560
{
561 562
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);
563

564
	pr_emerg(HW_ERR "MC1 Error: ");
565 566 567 568 569

	if (TLB_ERROR(ec))
		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			(xec ? "multimatch" : "parity error"));
	else if (BUS_ERROR(ec)) {
570
		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
571 572

		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
573 574 575 576 577
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			goto wrong_mc1_mce;
578
	} else if (fam_ops->mc1_mce(ec, xec))
579 580
		;
	else
581 582 583 584 585 586
		goto wrong_mc1_mce;

	return;

wrong_mc1_mce:
	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
587 588
}

589
/*
 * K8-style MC2 decoder (mce_amd_init() installs it for families 0xf-0x14).
 * The extended error code is checked first; for xec == 0 the error class
 * (TLB, bus, memory) and, for memory errors, the raw memory transaction
 * type value select the message.
 *
 * Returns true if a description was printed, false otherwise.
 */
static bool k8_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (xec == 0x1)
		pr_cont(" in the write data buffers.\n");
	else if (xec == 0x3)
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
	else if (xec == 0x0) {
		if (TLB_ERROR(ec))
			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
				TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				R4_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 r4 = R4(ec);

			if (r4 >= 0x7)
				pr_cont(": %s error during data copyback.\n",
					R4_MSG(ec));
			else if (r4 <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", R4_MSG(ec));
			else
				ret = false;
		} else
			ret = false;
	} else
		ret = false;

	return ret;
}

625
/*
 * Family 15h MC2 decoder. The error class is checked first, then the
 * extended error code. Memory-error codes are shifted down to indices of
 * the dense description table: 0x4-0xc map to xec - 0x4 and 0x10-0x14 map
 * to xec - 0x7.
 *
 * Returns true if a description was printed. Returns false, printing
 * nothing, for an unrecognized code or an error class that is none of TLB,
 * bus, memory or internal, so the caller reports corrupted info.
 */
static bool f15h_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (TLB_ERROR(ec)) {
		if (xec == 0x0)
			pr_cont("Data parity TLB read error.\n");
		else if (xec == 0x1)
			pr_cont("Poison data provided for TLB fill.\n");
		else
			ret = false;
	} else if (BUS_ERROR(ec)) {
		/* Do not print a description for a code we are about to reject. */
		if (xec > 2)
			ret = false;
		else
			pr_cont("Error during attempted NB data read.\n");
	} else if (MEM_ERROR(ec)) {
		switch (xec) {
		case 0x4 ... 0xc:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
			break;

		case 0x10 ... 0x14:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
			break;

		default:
			ret = false;
		}
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;
	} else {
		/* Same as f15h_mc0_mce(): an unclassified error code is not decoded. */
		ret = false;
	}

	return ret;
}

664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704
/*
 * Family 16h MC2 decoder. Only memory errors are handled. The extended
 * error code picks the failing structure and the memory transaction type
 * picks the qualifier printed with it.
 *
 * Returns true if a description was printed, false otherwise.
 */
static bool f16h_mc2_mce(u16 ec, u8 xec)
{
	const u8 mem_tx = R4(ec);
	const char *what;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x04 ... 0x05:
		pr_cont("%cBUFF parity error.\n", (mem_tx == R4_RD) ? 'I' : 'O');
		return true;

	case 0x09 ... 0x0b:
	case 0x0d ... 0x0f:
		if (mem_tx == R4_GEN)
			what = "BankReq";
		else if (mem_tx == R4_SNOOP)
			what = "Prb";
		else
			what = "Fill";

		pr_cont("ECC error in L2 tag (%s).\n", what);
		return true;

	case 0x10 ... 0x19:
	case 0x1b:
		/* "Hit" applies only to reads whose low two xec bits are clear. */
		if (mem_tx == R4_RD && !(xec & 0x3))
			what = "Hit";
		else if (mem_tx == R4_GEN)
			what = "Attr";
		else if (mem_tx == R4_EVICT)
			what = "Vict";
		else
			what = "Fill";

		pr_cont("ECC error in L2 data array (%s).\n", what);
		return true;

	case 0x1c ... 0x1d:
	case 0x1f:
		if (mem_tx == R4_RD)
			what = "Hit";
		else if (mem_tx == R4_GEN)
			what = "Attr";
		else
			what = "Fill";

		pr_cont("Parity error in L2 attribute bits (%s).\n", what);
		return true;

	default:
		return false;
	}
}

705 706 707 708
static void decode_mc2_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);
709

710 711 712 713
	pr_emerg(HW_ERR "MC2 Error: ");

	if (!fam_ops->mc2_mce(ec, xec))
		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
714 715
}

716
static void decode_mc3_mce(struct mce *m)
717
{
718 719
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);
720

721
	if (boot_cpu_data.x86 >= 0x14) {
722
		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
723 724 725
			 " please report on LKML.\n");
		return;
	}
726

727
	pr_emerg(HW_ERR "MC3 Error");
728 729

	if (xec == 0x0) {
730
		u8 r4 = R4(ec);
731

732
		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
733
			goto wrong_mc3_mce;
734

735
		pr_cont(" during %s.\n", R4_MSG(ec));
736
	} else
737
		goto wrong_mc3_mce;
738

739 740
	return;

741 742
 wrong_mc3_mce:
	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
743 744
}

745
static void decode_mc4_mce(struct mce *m)
746
{
747 748 749 750 751
	struct cpuinfo_x86 *c = &boot_cpu_data;
	int node_id = amd_get_nb_id(m->extcpu);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, 0x1f);
	u8 offset = 0;
752

753
	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
754

755 756
	switch (xec) {
	case 0x0 ... 0xe:
757

758 759 760 761
		/* special handling for DRAM ECCs */
		if (xec == 0x0 || xec == 0x8) {
			/* no ECCs on F11h */
			if (c->x86 == 0x11)
762
				goto wrong_mc4_mce;
763

764
			pr_cont("%s.\n", mc4_mce_desc[xec]);
765

766 767
			if (decode_dram_ecc)
				decode_dram_ecc(node_id, m);
768 769
			return;
		}
770 771 772 773 774 775 776 777
		break;

	case 0xf:
		if (TLB_ERROR(ec))
			pr_cont("GART Table Walk data error.\n");
		else if (BUS_ERROR(ec))
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
		else
778
			goto wrong_mc4_mce;
779
		return;
780

781
	case 0x19:
782
		if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16)
783 784
			pr_cont("Compute Unit Data Error.\n");
		else
785
			goto wrong_mc4_mce;
786
		return;
787

788
	case 0x1c ... 0x1f:
789
		offset = 13;
790 791 792
		break;

	default:
793
		goto wrong_mc4_mce;
794
	}
795

796
	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
797 798
	return;

799 800
 wrong_mc4_mce:
	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
801 802
}

803
static void decode_mc5_mce(struct mce *m)
B
Borislav Petkov 已提交
804
{
805
	struct cpuinfo_x86 *c = &boot_cpu_data;
806
	u16 ec = EC(m->status);
807
	u8 xec = XEC(m->status, xec_mask);
808 809

	if (c->x86 == 0xf || c->x86 == 0x11)
810
		goto wrong_mc5_mce;
B
Borislav Petkov 已提交
811

812
	pr_emerg(HW_ERR "MC5 Error: ");
813

814 815 816 817 818 819 820 821
	if (INT_ERROR(ec)) {
		if (xec <= 0x1f) {
			pr_cont("Hardware Assert.\n");
			return;
		} else
			goto wrong_mc5_mce;
	}

822
	if (xec == 0x0 || xec == 0xc)
823
		pr_cont("%s.\n", mc5_mce_desc[xec]);
824
	else if (xec <= 0xd)
825
		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
826
	else
827
		goto wrong_mc5_mce;
828 829

	return;
B
Borislav Petkov 已提交
830

831 832
 wrong_mc5_mce:
	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
B
Borislav Petkov 已提交
833 834
}

835
static void decode_mc6_mce(struct mce *m)
836
{
837
	u8 xec = XEC(m->status, xec_mask);
838

839
	pr_emerg(HW_ERR "MC6 Error: ");
840

841
	if (xec > 0x5)
842
		goto wrong_mc6_mce;
843

844
	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
845 846
	return;

847 848
 wrong_mc6_mce:
	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
849 850
}

851 852 853
/* Decode errors according to Scalable MCA specification */
static void decode_smca_errors(struct mce *m)
{
854
	struct smca_hwid *hwid;
855
	unsigned int bank_type;
856
	const char *ip_name;
857
	u8 xec = XEC(m->status, xec_mask);
858

859
	if (m->bank >= ARRAY_SIZE(smca_banks))
860 861
		return;

862 863 864
	if (boot_cpu_data.x86 >= 0x17 && m->bank == 4)
		pr_emerg(HW_ERR "Bank 4 is reserved on Fam17h.\n");

865 866
	hwid = smca_banks[m->bank].hwid;
	if (!hwid)
867 868
		return;

869
	bank_type = hwid->bank_type;
B
Borislav Petkov 已提交
870
	ip_name = smca_get_long_name(bank_type);
871

872
	pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec);
873

874 875
	/* Only print the decode of valid error codes */
	if (xec < smca_mce_descs[bank_type].num_descs &&
876
			(hwid->xec_bitmap & BIT_ULL(xec))) {
877 878 879
		pr_emerg(HW_ERR "%s Error: ", ip_name);
		pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
	}
880 881 882 883 884 885 886

	/*
	 * amd_get_nb_id() returns the last level cache id.
	 * The last level cache on Fam17h is 1 level below the node.
	 */
	if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
		decode_dram_ecc(amd_get_nb_id(m->extcpu) >> 1, m);
887 888
}

B
Borislav Petkov 已提交
889
/*
 * Print a generic, family-independent breakdown of the error code @ec.
 *
 * Internal errors print only their type. All others print the cache level,
 * then either the mem/io field (bus errors) or the transaction type, then
 * the memory transaction type for memory and bus errors, and finally the
 * participating processor and timeout state for bus errors.
 */
static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec))
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
		pr_cont(", mem-tx: %s", R4_MSG(ec));

		if (BUS_ERROR(ec))
			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	}

	pr_cont("\n");
}

913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928
/*
 * Decide whether an MCE should be dropped before any decoding happens.
 *
 * Returns true when @m must be ignored, false when it should be decoded.
 */
static bool amd_filter_mce(struct mce *m)
{
	const u8 ext_code = (m->status >> 16) & 0x1f;
	const bool gart_tlb_err = (m->bank == 4 && ext_code == 0x5);

	/* NB GART TLB error reporting is disabled by default. */
	return gart_tlb_err && !report_gart_errors;
}

B
Borislav Petkov 已提交
929 930 931 932 933 934 935 936 937 938 939
/*
 * Map the severity bits of @m to a one-line summary.
 *
 * Uncorrected errors are split into fatal (PCC set), software restartable
 * (RIPV set in mcgstatus) and software containable. Errors that are not
 * uncorrected are either deferred or corrected.
 *
 * Returns a pointer to a string literal; the caller must not free it.
 */
static const char *decode_error_status(struct mce *m)
{
	if (m->status & MCI_STATUS_UC) {
		if (m->status & MCI_STATUS_PCC)
			return "System Fatal error.";
		if (m->mcgstatus & MCG_STATUS_RIPV)
			return "Uncorrected, software restartable error.";
		return "Uncorrected, software containable error.";
	}

	if (m->status & MCI_STATUS_DEFERRED)
		return "Deferred error, no action required.";

	return "Corrected error, no action required.";
}

B
Borislav Petkov 已提交
945
int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
946
{
947
	struct mce *m = (struct mce *)data;
948
	struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
949
	int ecc;
950

951 952 953
	if (amd_filter_mce(m))
		return NOTIFY_STOP;

954 955 956 957 958 959 960
	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		c->x86, c->x86_model, c->x86_mask,
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
961 962
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
963 964 965 966
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"));

967 968 969 970 971 972 973
	if (c->x86 >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (c->x86 != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}
974

975
	if (boot_cpu_has(X86_FEATURE_SMCA)) {
976 977 978
		u32 low, high;
		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);

979 980
		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));

981 982 983 984 985
		if (!rdmsr_safe(addr, &low, &high) &&
		    (low & MCI_CONFIG_MCAX))
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
	}

986 987 988 989 990 991 992 993
	/* do the two bits[14:13] together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
994
		pr_emerg(HW_ERR "Error Addr: 0x%016llx", m->addr);
995

996
	if (boot_cpu_has(X86_FEATURE_SMCA)) {
997 998 999
		if (m->status & MCI_STATUS_SYNDV)
			pr_cont(", Syndrome: 0x%016llx", m->synd);

1000 1001
		pr_cont(", IPID: 0x%016llx", m->ipid);

1002 1003
		pr_cont("\n");

1004 1005
		decode_smca_errors(m);
		goto err_code;
1006 1007
	} else
		pr_cont("\n");
1008

1009 1010 1011
	if (!fam_ops)
		goto err_code;

1012 1013
	switch (m->bank) {
	case 0:
1014
		decode_mc0_mce(m);
1015
		break;
1016

1017
	case 1:
1018
		decode_mc1_mce(m);
1019 1020
		break;

1021
	case 2:
1022
		decode_mc2_mce(m);
1023 1024
		break;

1025
	case 3:
1026
		decode_mc3_mce(m);
1027 1028
		break;

1029
	case 4:
1030
		decode_mc4_mce(m);
1031 1032
		break;

B
Borislav Petkov 已提交
1033
	case 5:
1034
		decode_mc5_mce(m);
B
Borislav Petkov 已提交
1035 1036
		break;

1037
	case 6:
1038
		decode_mc6_mce(m);
1039 1040
		break;

1041 1042
	default:
		break;
1043
	}
1044

1045
 err_code:
1046
	amd_decode_err_code(m->status & 0xffff);
1047 1048

	return NOTIFY_STOP;
1049
}
B
Borislav Petkov 已提交
1050
EXPORT_SYMBOL_GPL(amd_decode_mce);
1051

1052 1053 1054 1055
/* Notifier block that mce_amd_init() registers on the MCE decode chain. */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
};

1056 1057
/*
 * Set up the decoder for the boot CPU's family and register it on the MCE
 * decode chain.
 *
 * Allocates fam_ops, selects the bank 0-2 decoders and the extended error
 * code mask by family, and registers amd_mce_dec_nb. Family 0x17 installs
 * no per-family decoders and requires SMCA support instead.
 *
 * Returns 0 on success, -ENODEV on a non-AMD CPU, -ENOMEM if the
 * allocation fails, and -EINVAL (after freeing fam_ops) for an unsupported
 * family or a family 0x17 CPU without SMCA.
 */
static int __init mce_amd_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor != X86_VENDOR_AMD)
		return -ENODEV;

	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
	if (!fam_ops)
		return -ENOMEM;

	switch (c->x86) {
	case 0xf:
		fam_ops->mc0_mce = k8_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x10:
		fam_ops->mc0_mce = f10h_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x11:
		fam_ops->mc0_mce = k8_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x12:
		fam_ops->mc0_mce = f12h_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x14:
		fam_ops->mc0_mce = cat_mc0_mce;
		fam_ops->mc1_mce = cat_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x15:
		/* Model 0x60 uses a wider extended error code field. */
		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;

		fam_ops->mc0_mce = f15h_mc0_mce;
		fam_ops->mc1_mce = f15h_mc1_mce;
		fam_ops->mc2_mce = f15h_mc2_mce;
		break;

	case 0x16:
		xec_mask = 0x1f;
		fam_ops->mc0_mce = cat_mc0_mce;
		fam_ops->mc1_mce = cat_mc1_mce;
		fam_ops->mc2_mce = f16h_mc2_mce;
		break;

	case 0x17:
		xec_mask = 0x3f;
		if (!boot_cpu_has(X86_FEATURE_SMCA)) {
			printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
			goto err_out;
		}
		break;

	default:
		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
		goto err_out;
	}

	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	mce_register_decode_chain(&amd_mce_dec_nb);

	return 0;

err_out:
	kfree(fam_ops);
	fam_ops = NULL;
	return -EINVAL;
}
early_initcall(mce_amd_init);
1138 1139 1140 1141

#ifdef MODULE
/*
 * Module teardown: take the decoder off the MCE decode chain first, then
 * release the per-family decoder table.
 */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif