mce_amd.c 26.1 KB
Newer Older
1
#include <linux/module.h>
2 3
#include <linux/slab.h>

B
Borislav Petkov 已提交
4
#include "mce_amd.h"
D
Doug Thompson 已提交
5

6 7
static struct amd_decoder_ops *fam_ops;

8
static u8 xec_mask	 = 0xf;
9

10
static bool report_gart_errors;
11
static void (*nb_bus_decoder)(int node_id, struct mce *m);
12 13 14 15 16 17 18

/*
 * Store @v in report_gart_errors. amd_filter_mce() consults that flag to
 * decide whether bank 4 errors with xec 0x5 are reported or dropped.
 */
void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);

19
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
20 21 22 23 24
{
	nb_bus_decoder = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);

25
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
26 27 28 29 30 31 32 33 34
{
	if (nb_bus_decoder) {
		WARN_ON(nb_bus_decoder != f);

		nb_bus_decoder = NULL;
	}
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);

D
Doug Thompson 已提交
35 36 37 38
/*
 * string representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 */
B
Borislav Petkov 已提交
39 40

/* transaction type */
static const char * const tt_msgs[] = {
	"INSN", "DATA", "GEN", "RESV"
};

/* cache level */
static const char * const ll_msgs[] = {
	"RESV", "L1", "L2", "L3/GEN"
};

/* memory transaction type */
static const char * const rrrr_msgs[] = {
	"GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor; exported for use outside this file */
const char * const pp_msgs[] = {
	"SRC", "RES", "OBS", "GEN"
};
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = {
	"no timeout", "timed out"
};

/* memory or i/o */
static const char * const ii_msgs[] = {
	"MEM", "RESV", "IO", "GEN"
};

/* internal error type */
static const char * const uu_msgs[] = {
	"RESV", "RESV", "HWA", "RESV"
};
63

64
/*
 * F15h MC1 error descriptions. The table is packed: f15h_mc1_mce() maps
 * xec 0x0-0xa directly, xec 0xd to index 11 (xec - 2) and xec 0x10-0x15 to
 * indices 12-17 (xec - 4).
 */
static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",		/* xec = 0x0 */
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",		/* xec = 0xa */
	"PFB valid bit parity error",			/* xec = 0xd */
	"Microcode Patch Buffer",			/* xec = 0x10 */
	"uop queue",					/* xec = 0x11 */
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"				/* xec = 0x15 */
};

85
/*
 * F15h MC2 error descriptions. f15h_mc2_mce() maps xec 0x4-0xc to indices
 * 0-8 (xec - 0x4) and xec 0x10-0x14 to indices 9-13 (xec - 0x7).
 */
static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills",			/* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",			/* xec = 0xc */
	"L2 Tag ECC error",				/* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"			/* xec = 0x14 */
};

102
/*
 * MC4 error descriptions. decode_mc4_mce() indexes this with xec for
 * xec 0x0-0xe and with xec - 13 for xec 0x1c-0x1f (indices 15-18).
 */
static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",		/* xec = 0x0 */
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",		/* xec = 0x8 */
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",			/* xec = 0xe */
	"L3 data cache ECC error",			/* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"	/* xec = 0x1f */
};

124
/*
 * MC5 error descriptions, indexed directly by xec 0x0-0xd in
 * decode_mc5_mce(). Entries 0x0 and 0xc are printed as-is; the others get a
 * " parity error" suffix.
 */
static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",			/* xec = 0x0 */
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",				/* xec = 0xc */
	"Retire status queue"				/* xec = 0xd */
};

141 142 143 144 145 146 147 148 149
/*
 * MC6 error descriptions, indexed by xec 0x0-0x5 in decode_mc6_mce(), which
 * appends " parity error" to each.
 */
static const char * const mc6_mce_desc[] = {
	"Hardware Assertion",
	"Free List",
	"Physical Register File",
	"Retire Queue",
	"Scheduler table",
	"Status Register File",
};

150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
/* Scalable MCA error strings */
/*
 * SMCA LS bank error descriptions, indexed by xec in
 * decode_f17h_core_errors(). Index 0x4 is reserved; the decoder rejects it
 * before the table is consulted.
 */
static const char * const f17h_ls_mce_desc[] = {
	"Load queue parity",
	"Store queue parity",
	"Miss address buffer payload parity",
	"L1 TLB parity",
	"",						/* reserved */
	"DC tag error type 6",
	"DC tag error type 1",
	"Internal error type 1",
	"Internal error type 2",
	"Sys Read data error thread 0",
	"Sys read data error thread 1",
	"DC tag error type 2",
	"DC data error type 1 (poison consumption)",
	"DC data error type 2",
	"DC data error type 3",
	"DC tag error type 4",
	"L2 TLB parity",
	"PDC parity error",
	"DC tag error type 3",
	"DC tag error type 5",
	"L2 fill data error",
};

/* SMCA IF bank error descriptions, indexed by xec in decode_f17h_core_errors(). */
static const char * const f17h_if_mce_desc[] = {
	"microtag probe port parity error",
	"IC microtag or full tag multi-hit error",
	"IC full tag parity",
	"IC data array parity",
	"Decoupling queue phys addr parity error",
	"L0 ITLB parity error",
	"L1 ITLB parity error",
	"L2 ITLB parity error",
	"BPQ snoop parity on Thread 0",
	"BPQ snoop parity on Thread 1",
	"L1 BTB multi-match error",
	"L2 BTB multi-match error",
	"L2 Cache Response Poison error",
	"System Read Data error",
};

/* SMCA L2 cache bank error descriptions, indexed by xec in decode_f17h_core_errors(). */
static const char * const f17h_l2_mce_desc[] = {
	"L2M tag multi-way-hit error",
	"L2M tag ECC error",
	"L2M data ECC error",
	"HW assert",
};

/* SMCA DE bank error descriptions, indexed by xec in decode_f17h_core_errors(). */
static const char * const f17h_de_mce_desc[] = {
	"uop cache tag parity error",
	"uop cache data parity error",
	"Insn buffer parity error",
	"uop queue parity error",
	"Insn dispatch queue parity error",
	"Fetch address FIFO parity",
	"Patch RAM data parity",
	"Patch RAM sequencer parity",
	"uop buffer parity"
};

/* SMCA EX bank error descriptions, indexed by xec in decode_f17h_core_errors(). */
static const char * const f17h_ex_mce_desc[] = {
	"Watchdog timeout error",
	"Phy register file parity",
	"Flag register file parity",
	"Immediate displacement register file parity",
	"Address generator payload parity",
	"EX payload parity",
	"Checkpoint queue parity",
	"Retire dispatch queue parity",
	"Retire status queue parity error",
	"Scheduling queue parity error",
	"Branch buffer queue parity error",
};

/* SMCA FP bank error descriptions, indexed by xec in decode_f17h_core_errors(). */
static const char * const f17h_fp_mce_desc[] = {
	"Physical register file parity",
	"Freelist parity error",
	"Schedule queue parity",
	"NSQ parity error",
	"Retire queue parity",
	"Status register file parity",
	"Hardware assertion",
};

/* SMCA L3 cache bank error descriptions, indexed by xec in decode_f17h_core_errors(). */
static const char * const f17h_l3_mce_desc[] = {
	"Shadow tag macro ECC error",
	"Shadow tag macro multi-way-hit error",
	"L3M tag ECC error",
	"L3M tag multi-way-hit error",
	"L3M data ECC error",
	"XI parity, L3 fill done channel error",
	"L3 victim queue parity",
	"L3 HW assert",
};

/* SMCA CS bank error descriptions, indexed by xec in decode_df_errors(). */
static const char * const f17h_cs_mce_desc[] = {
	"Illegal request from transport layer",
	"Address violation",
	"Security violation",
	"Illegal response from transport layer",
	"Unexpected response",
	"Parity error on incoming request or probe response data",
	"Parity error on incoming read response data",
	"Atomic request parity",
	"ECC error on probe filter access",
};

/* SMCA PIE bank error descriptions, indexed by xec in decode_df_errors(). */
static const char * const f17h_pie_mce_desc[] = {
	"HW assert",
	"Internal PIE register security violation",
	"Error on GMI link",
	"Poison data written to internal PIE register",
};

/* SMCA UMC error descriptions, indexed by xec in decode_smca_errors(). */
static const char * const f17h_umc_mce_desc[] = {
	"DRAM ECC error",
	"Data poison error on DRAM",
	"SDP parity error",
	"Advanced peripheral bus error",
	"Command/address parity error",
	"Write data CRC error",
};

/* SMCA PB error descriptions, indexed by xec in decode_smca_errors(). */
static const char * const f17h_pb_mce_desc[] = {
	"Parameter Block RAM ECC error",
};

/* SMCA PSP error descriptions, indexed by xec in decode_smca_errors(). */
static const char * const f17h_psp_mce_desc[] = {
	"PSP RAM ECC or parity error",
};

/* SMCA SMU error descriptions, indexed by xec in decode_smca_errors(). */
static const char * const f17h_smu_mce_desc[] = {
	"SMU RAM ECC or parity error",
};

286
/*
 * F12h MC0 decoder: handles memory errors at cache level L1 or L2.
 * Returns true when a description was printed, false when the signature is
 * not recognized. @xec is unused here.
 */
static bool f12h_mc0_mce(u16 ec, u8 xec)
{
	u8 level;

	if (!MEM_ERROR(ec))
		return false;

	level = LL(ec);

	if (level == LL_L2) {
		pr_cont("during L1 linefill from L2.\n");
		return true;
	}

	if (level == LL_L1) {
		pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
		return true;
	}

	return false;
}
303

304
/*
 * F10h MC0 decoder: recognizes the data-scrub signature (generic memory
 * transaction at L1) and defers everything else to f12h_mc0_mce().
 */
static bool f10h_mc0_mce(u16 ec, u8 xec)
{
	bool scrub = (R4(ec) == R4_GEN) && (LL(ec) == LL_L1);

	if (!scrub)
		return f12h_mc0_mce(ec, xec);

	pr_cont("during data scrub.\n");
	return true;
}

313
/*
 * K8 MC0 decoder: bus errors are reported as system linefill errors; any
 * other signature is handed to f10h_mc0_mce().
 */
static bool k8_mc0_mce(u16 ec, u8 xec)
{
	if (!BUS_ERROR(ec))
		return f10h_mc0_mce(ec, xec);

	pr_cont("during system linefill.\n");
	return true;
}

323
/*
 * MC0 decoder used for families 0x14 and 0x16 (see mce_amd_init()).
 *
 * Memory errors must be data transactions at L1; bus errors must target
 * MEM or IO at the L3/generic level. Returns true when the signature was
 * recognized and described, false otherwise. @xec is unused here.
 */
static bool cat_mc0_mce(u16 ec, u8 xec)
{
	u8 mem_tx = R4(ec);

	if (MEM_ERROR(ec)) {
		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
			return false;

		switch (mem_tx) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(mem_tx == R4_DWR ? "store" : "load/hw prf"));
			return true;

		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			return true;

		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			return true;

		default:
			return false;
		}
	}

	if (BUS_ERROR(ec)) {
		u8 target = II(ec);

		if (LL(ec) != LL_LG)
			return false;

		if (target != II_MEM && target != II_IO)
			return false;

		/* The prefix is emitted before the transaction type is validated. */
		pr_cont("System read data error on a ");

		switch (mem_tx) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			return true;

		case R4_DWR:
			pr_cont("store.\n");
			return true;

		case R4_DRD:
			pr_cont("load.\n");
			return true;

		default:
			return false;
		}
	}

	return false;
}

375
/*
 * F15h MC0 decoder. Memory errors are described by xec; bus errors are
 * either a system read data error (xec == 0) or a numbered internal error;
 * internal errors are hardware asserts for xec <= 0x1f.
 * Returns true when a description was printed.
 */
static bool f15h_mc0_mce(u16 ec, u8 xec)
{
	if (MEM_ERROR(ec)) {
		switch (xec) {
		case 0x0:
			pr_cont("Data Array access error.\n");
			return true;

		case 0x1:
			pr_cont("UC error during a linefill from L2/NB.\n");
			return true;

		case 0x2:
		case 0x11:
			pr_cont("STQ access error.\n");
			return true;

		case 0x3:
			pr_cont("SCB access error.\n");
			return true;

		case 0x10:
			pr_cont("Tag error.\n");
			return true;

		case 0x12:
			pr_cont("LDQ access error.\n");
			return true;

		default:
			return false;
		}
	}

	if (BUS_ERROR(ec)) {
		if (xec)
			pr_cont(" Internal error condition type %d.\n", xec);
		else
			pr_cont("System Read Data Error.\n");

		return true;
	}

	if (INT_ERROR(ec)) {
		if (xec > 0x1f)
			return false;

		pr_cont("Hardware Assert.\n");
		return true;
	}

	return false;
}

428
static void decode_mc0_mce(struct mce *m)
429
{
430 431
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);
432

433
	pr_emerg(HW_ERR "MC0 Error: ");
434 435 436

	/* TLB error signatures are the same across families */
	if (TLB_ERROR(ec)) {
437
		if (TT(ec) == TT_DATA) {
438
			pr_cont("%s TLB %s.\n", LL_MSG(ec),
439 440
				((xec == 2) ? "locked miss"
					    : (xec ? "multimatch" : "parity")));
441 442
			return;
		}
443
	} else if (fam_ops->mc0_mce(ec, xec))
444 445
		;
	else
446
		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
447 448
}

449
/*
 * K8 MC1 decoder: only memory errors are handled. Level 0x2 is a linefill
 * from L2; level 0x1 is further split by memory transaction type.
 * Returns true when a description was printed. @xec is unused here.
 */
static bool k8_mc1_mce(u16 ec, u8 xec)
{
	u8 level = LL(ec);

	if (!MEM_ERROR(ec))
		return false;

	if (level == 0x2) {
		pr_cont("during a linefill from L2.\n");
		return true;
	}

	if (level != 0x1)
		return false;

	switch (R4(ec)) {
	case R4_IRD:
		pr_cont("Parity error during data load.\n");
		return true;

	case R4_EVICT:
		pr_cont("Copyback Parity/Victim error.\n");
		return true;

	case R4_SNOOP:
		pr_cont("Tag Snoop error.\n");
		return true;

	default:
		return false;
	}
}

483
/*
 * MC1 decoder used for families 0x14 and 0x16 (see mce_amd_init()).
 * Only memory errors on instruction transactions are recognized; the memory
 * transaction type is checked first, then xec.
 * Returns true when a description was printed.
 */
static bool cat_mc1_mce(u16 ec, u8 xec)
{
	u8 mem_tx = R4(ec);

	if (!MEM_ERROR(ec) || TT(ec) != TT_INSTR)
		return false;

	if (mem_tx == R4_IRD) {
		pr_cont("Data/tag array parity error for a tag hit.\n");
		return true;
	}

	if (mem_tx == R4_SNOOP) {
		pr_cont("Tag error during snoop/victimization.\n");
		return true;
	}

	switch (xec) {
	case 0x0:
		pr_cont("Tag parity error from victim castout.\n");
		return true;

	case 0x2:
		pr_cont("Microcode patch RAM parity error.\n");
		return true;

	default:
		return false;
	}
}

508
/*
 * F15h MC1 decoder: translates xec into an index of the packed
 * f15h_mc1_mce_desc[] table. Only memory errors are handled.
 * Returns true when a description was printed.
 */
static bool f15h_mc1_mce(u16 ec, u8 xec)
{
	u8 idx;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x0 ... 0xa:
		idx = xec;
		break;

	case 0xd:
		idx = xec - 2;
		break;

	case 0x10:
		idx = xec - 4;
		break;

	case 0x11 ... 0x15:
		/* Decoder structures share one message template. */
		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec - 4]);
		return true;

	default:
		return false;
	}

	pr_cont("%s.\n", f15h_mc1_mce_desc[idx]);
	return true;
}

538
static void decode_mc1_mce(struct mce *m)
539
{
540 541
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);
542

543
	pr_emerg(HW_ERR "MC1 Error: ");
544 545 546 547 548

	if (TLB_ERROR(ec))
		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			(xec ? "multimatch" : "parity error"));
	else if (BUS_ERROR(ec)) {
549
		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
550 551

		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
552 553 554 555 556
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			goto wrong_mc1_mce;
557
	} else if (fam_ops->mc1_mce(ec, xec))
558 559
		;
	else
560 561 562 563 564 565
		goto wrong_mc1_mce;

	return;

wrong_mc1_mce:
	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
566 567
}

568
/*
 * K8 MC2 decoder. xec selects the structure (write data buffers, L2 tags,
 * victim data buffers); xec 0 is further split by error class.
 * Returns true when a description was printed.
 */
static bool k8_mc2_mce(u16 ec, u8 xec)
{
	u8 mem_tx;

	switch (xec) {
	case 0x1:
		pr_cont(" in the write data buffers.\n");
		return true;

	case 0x3:
		pr_cont(" in the victim data buffers.\n");
		return true;

	case 0x2:
		if (!MEM_ERROR(ec))
			return false;

		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
		return true;

	case 0x0:
		break;

	default:
		return false;
	}

	/* xec == 0x0 from here on */
	if (TLB_ERROR(ec)) {
		pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
			TT_MSG(ec));
		return true;
	}

	if (BUS_ERROR(ec)) {
		pr_cont(": %s/ECC error in data read from NB: %s.\n",
			R4_MSG(ec), PP_MSG(ec));
		return true;
	}

	if (!MEM_ERROR(ec))
		return false;

	mem_tx = R4(ec);

	if (mem_tx >= 0x7) {
		pr_cont(": %s error during data copyback.\n",
			R4_MSG(ec));
		return true;
	}

	if (mem_tx <= 0x1) {
		pr_cont(": %s parity/ECC error during data "
			"access from L2.\n", R4_MSG(ec));
		return true;
	}

	return false;
}

604
/*
 * F15h MC2 decoder.
 *
 * Returns true when the signature was recognized and a description printed,
 * false otherwise (the caller then reports the info as corrupted).
 *
 * Two inconsistencies of the previous version are fixed: a bus error with
 * xec > 2 no longer prints a valid-looking description before being
 * rejected, and an error code that matches none of the TLB/BUS/MEM/INT
 * classes is now rejected instead of returning true with nothing printed.
 */
static bool f15h_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (TLB_ERROR(ec)) {
		if (xec == 0x0)
			pr_cont("Data parity TLB read error.\n");
		else if (xec == 0x1)
			pr_cont("Poison data provided for TLB fill.\n");
		else
			ret = false;
	} else if (BUS_ERROR(ec)) {
		if (xec > 2)
			ret = false;
		else
			pr_cont("Error during attempted NB data read.\n");
	} else if (MEM_ERROR(ec)) {
		switch (xec) {
		case 0x4 ... 0xc:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
			break;

		case 0x10 ... 0x14:
			/* table is packed: 0x10 follows the 9 entries for 0x4-0xc */
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
			break;

		default:
			ret = false;
		}
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;
	} else {
		ret = false;
	}

	return ret;
}

643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683
/*
 * F16h MC2 decoder: only memory errors are handled. xec selects the failing
 * structure and the memory transaction type selects the qualifier printed
 * in parentheses. Returns true when a description was printed.
 */
static bool f16h_mc2_mce(u16 ec, u8 xec)
{
	u8 mem_tx = R4(ec);
	const char *qual;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x04 ... 0x05:
		pr_cont("%cBUFF parity error.\n", (mem_tx == R4_RD) ? 'I' : 'O');
		return true;

	case 0x09 ... 0x0b:
	case 0x0d ... 0x0f:
		if (mem_tx == R4_GEN)
			qual = "BankReq";
		else if (mem_tx == R4_SNOOP)
			qual = "Prb";
		else
			qual = "Fill";

		pr_cont("ECC error in L2 tag (%s).\n", qual);
		return true;

	case 0x10 ... 0x19:
	case 0x1b:
		/* "Hit" requires a read with the low two xec bits clear. */
		if (mem_tx == R4_RD && !(xec & 0x3))
			qual = "Hit";
		else if (mem_tx == R4_GEN)
			qual = "Attr";
		else if (mem_tx == R4_EVICT)
			qual = "Vict";
		else
			qual = "Fill";

		pr_cont("ECC error in L2 data array (%s).\n", qual);
		return true;

	case 0x1c ... 0x1d:
	case 0x1f:
		if (mem_tx == R4_RD)
			qual = "Hit";
		else if (mem_tx == R4_GEN)
			qual = "Attr";
		else
			qual = "Fill";

		pr_cont("Parity error in L2 attribute bits (%s).\n", qual);
		return true;

	default:
		return false;
	}
}

684 685 686 687
static void decode_mc2_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);
688

689 690 691 692
	pr_emerg(HW_ERR "MC2 Error: ");

	if (!fam_ops->mc2_mce(ec, xec))
		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
693 694
}

695
static void decode_mc3_mce(struct mce *m)
696
{
697 698
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);
699

700
	if (boot_cpu_data.x86 >= 0x14) {
701
		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
702 703 704
			 " please report on LKML.\n");
		return;
	}
705

706
	pr_emerg(HW_ERR "MC3 Error");
707 708

	if (xec == 0x0) {
709
		u8 r4 = R4(ec);
710

711
		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
712
			goto wrong_mc3_mce;
713

714
		pr_cont(" during %s.\n", R4_MSG(ec));
715
	} else
716
		goto wrong_mc3_mce;
717

718 719
	return;

720 721
 wrong_mc3_mce:
	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
722 723
}

724
static void decode_mc4_mce(struct mce *m)
725
{
726 727 728 729 730
	struct cpuinfo_x86 *c = &boot_cpu_data;
	int node_id = amd_get_nb_id(m->extcpu);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, 0x1f);
	u8 offset = 0;
731

732
	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
733

734 735
	switch (xec) {
	case 0x0 ... 0xe:
736

737 738 739 740
		/* special handling for DRAM ECCs */
		if (xec == 0x0 || xec == 0x8) {
			/* no ECCs on F11h */
			if (c->x86 == 0x11)
741
				goto wrong_mc4_mce;
742

743
			pr_cont("%s.\n", mc4_mce_desc[xec]);
744

745 746 747 748
			if (nb_bus_decoder)
				nb_bus_decoder(node_id, m);
			return;
		}
749 750 751 752 753 754 755 756
		break;

	case 0xf:
		if (TLB_ERROR(ec))
			pr_cont("GART Table Walk data error.\n");
		else if (BUS_ERROR(ec))
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
		else
757
			goto wrong_mc4_mce;
758
		return;
759

760
	case 0x19:
761
		if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16)
762 763
			pr_cont("Compute Unit Data Error.\n");
		else
764
			goto wrong_mc4_mce;
765
		return;
766

767
	case 0x1c ... 0x1f:
768
		offset = 13;
769 770 771
		break;

	default:
772
		goto wrong_mc4_mce;
773
	}
774

775
	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
776 777
	return;

778 779
 wrong_mc4_mce:
	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
780 781
}

782
static void decode_mc5_mce(struct mce *m)
B
Borislav Petkov 已提交
783
{
784
	struct cpuinfo_x86 *c = &boot_cpu_data;
785
	u16 ec = EC(m->status);
786
	u8 xec = XEC(m->status, xec_mask);
787 788

	if (c->x86 == 0xf || c->x86 == 0x11)
789
		goto wrong_mc5_mce;
B
Borislav Petkov 已提交
790

791
	pr_emerg(HW_ERR "MC5 Error: ");
792

793 794 795 796 797 798 799 800
	if (INT_ERROR(ec)) {
		if (xec <= 0x1f) {
			pr_cont("Hardware Assert.\n");
			return;
		} else
			goto wrong_mc5_mce;
	}

801
	if (xec == 0x0 || xec == 0xc)
802
		pr_cont("%s.\n", mc5_mce_desc[xec]);
803
	else if (xec <= 0xd)
804
		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
805
	else
806
		goto wrong_mc5_mce;
807 808

	return;
B
Borislav Petkov 已提交
809

810 811
 wrong_mc5_mce:
	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
B
Borislav Petkov 已提交
812 813
}

814
static void decode_mc6_mce(struct mce *m)
815
{
816
	u8 xec = XEC(m->status, xec_mask);
817

818
	pr_emerg(HW_ERR "MC6 Error: ");
819

820
	if (xec > 0x5)
821
		goto wrong_mc6_mce;
822

823
	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
824 825
	return;

826 827
 wrong_mc6_mce:
	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
828 829
}

830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936
/*
 * Print the description of a core-complex SMCA error.
 *
 * @ip_name:  label used in the "<ip_name> Error: " prefix
 * @xec:      extended error code, used as index into the per-bank table
 * @mca_type: selects the bank type (LS, IF, L2, DE, EX, FP, L3)
 *
 * Unknown bank types and out-of-range (or reserved) codes get a diagnostic
 * instead of a description.
 */
static void decode_f17h_core_errors(const char *ip_name, u8 xec,
				   unsigned int mca_type)
{
	const char * const *descs;
	size_t count;

	pr_emerg(HW_ERR "%s Error: ", ip_name);

	switch (mca_type) {
	case SMCA_LS:
		/* xec 0x4 is a reserved slot in the LS table */
		if (xec == 0x4) {
			pr_cont("Unrecognized LS MCA error code.\n");
			return;
		}

		descs = f17h_ls_mce_desc;
		count = ARRAY_SIZE(f17h_ls_mce_desc);
		break;

	case SMCA_IF:
		descs = f17h_if_mce_desc;
		count = ARRAY_SIZE(f17h_if_mce_desc);
		break;

	case SMCA_L2_CACHE:
		descs = f17h_l2_mce_desc;
		count = ARRAY_SIZE(f17h_l2_mce_desc);
		break;

	case SMCA_DE:
		descs = f17h_de_mce_desc;
		count = ARRAY_SIZE(f17h_de_mce_desc);
		break;

	case SMCA_EX:
		descs = f17h_ex_mce_desc;
		count = ARRAY_SIZE(f17h_ex_mce_desc);
		break;

	case SMCA_FP:
		descs = f17h_fp_mce_desc;
		count = ARRAY_SIZE(f17h_fp_mce_desc);
		break;

	case SMCA_L3_CACHE:
		descs = f17h_l3_mce_desc;
		count = ARRAY_SIZE(f17h_l3_mce_desc);
		break;

	default:
		pr_cont("Corrupted MCA core error info.\n");
		return;
	}

	if (xec >= count) {
		pr_cont("Unrecognized %s MCA bank error code.\n",
			 amd_core_mcablock_names[mca_type]);
		return;
	}

	pr_cont("%s.\n", descs[xec]);
}

/*
 * Print the description of a Data Fabric SMCA error.
 *
 * @xec:      extended error code, used as index into the per-bank table
 * @mca_type: selects the bank type (CS or PIE)
 */
static void decode_df_errors(u8 xec, unsigned int mca_type)
{
	const char * const *descs;
	size_t count;

	pr_emerg(HW_ERR "Data Fabric Error: ");

	switch (mca_type) {
	case SMCA_CS:
		descs = f17h_cs_mce_desc;
		count = ARRAY_SIZE(f17h_cs_mce_desc);
		break;

	case SMCA_PIE:
		descs = f17h_pie_mce_desc;
		count = ARRAY_SIZE(f17h_pie_mce_desc);
		break;

	default:
		pr_cont("Corrupted MCA Data Fabric info.\n");
		return;
	}

	if (xec >= count) {
		pr_cont("Unrecognized %s MCA bank error code.\n",
			 amd_df_mcablock_names[mca_type]);
		return;
	}

	pr_cont("%s.\n", descs[xec]);
}

/* Decode errors according to Scalable MCA specification */
/*
 * Decode errors according to Scalable MCA specification.
 *
 * Reads the IPID MSR of bank @m->bank, prints its value and dispatches on
 * the hardware id found in amd_hwids[]: core and Data Fabric banks have
 * their own decoders; UMC, PB, PSP and SMU are looked up in their tables
 * here. When the MSR read fails or the hardware id is unknown, a diagnostic
 * is printed and nothing is decoded.
 */
static void decode_smca_errors(struct mce *m)
{
	u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank);
	unsigned int hwid, mca_type, i;
	u8 xec = XEC(m->status, xec_mask);
	const char * const *error_desc_array;
	const char *ip_name;
	u32 low, high;
	size_t len;

	if (rdmsr_safe(addr, &low, &high)) {
		pr_emerg(HW_ERR "Invalid IP block specified.\n");
		return;
	}

	hwid = high & MCI_IPID_HWID;
	mca_type = (high & MCI_IPID_MCATYPE) >> 16;

	pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low);

	/*
	 * Based on hwid and mca_type values, decode errors from respective IPs.
	 * Note: mca_type values make sense only in the context of an hwid.
	 */
	for (i = 0; i < ARRAY_SIZE(amd_hwids); i++)
		if (amd_hwids[i].hwid == hwid)
			break;

	/* i == ARRAY_SIZE(amd_hwids) here means "not found" and hits default. */
	switch (i) {
	case SMCA_F17H_CORE:
		ip_name = (mca_type == SMCA_L3_CACHE) ?
			  "L3 Cache" : "F17h Core";
		decode_f17h_core_errors(ip_name, xec, mca_type);
		return;

	case SMCA_DF:
		decode_df_errors(xec, mca_type);
		return;

	case SMCA_UMC:
		error_desc_array = f17h_umc_mce_desc;
		len = ARRAY_SIZE(f17h_umc_mce_desc) - 1;
		break;

	case SMCA_PB:
		error_desc_array = f17h_pb_mce_desc;
		len = ARRAY_SIZE(f17h_pb_mce_desc) - 1;
		break;

	case SMCA_PSP:
		error_desc_array = f17h_psp_mce_desc;
		len = ARRAY_SIZE(f17h_psp_mce_desc) - 1;
		break;

	case SMCA_SMU:
		error_desc_array = f17h_smu_mce_desc;
		len = ARRAY_SIZE(f17h_smu_mce_desc) - 1;
		break;

	default:
		/* hwid is unsigned, so print it with %u */
		pr_emerg(HW_ERR "HWID:%u does not match any existing IPs.\n", hwid);
		return;
	}

	ip_name = amd_hwids[i].name;
	pr_emerg(HW_ERR "%s Error: ", ip_name);

	/* len holds the highest valid index of the selected table */
	if (xec > len) {
		pr_cont("Unrecognized %s MCA bank error code.\n", ip_name);
		return;
	}

	pr_cont("%s.\n", error_desc_array[xec]);
}

B
Borislav Petkov 已提交
1001
/*
 * Print the generic fields encoded in the 16-bit error code @ec.
 *
 * Internal errors only carry an internal error type. All other errors print
 * the cache level followed by either the mem/io target (bus errors) or the
 * transaction type; memory and bus errors add the memory transaction type,
 * and bus errors also add the participating processor and timeout state.
 */
static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec)) {
		pr_cont(", mem/io: %s", II_MSG(ec));
		pr_cont(", mem-tx: %s", R4_MSG(ec));
		pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	} else {
		pr_cont(", tx: %s", TT_MSG(ec));

		if (MEM_ERROR(ec))
			pr_cont(", mem-tx: %s", R4_MSG(ec));
	}

	pr_cont("\n");
}

1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040
/*
 * Filter out unwanted MCE signatures here.
 *
 * Returns true when @m should be dropped without being decoded. Currently
 * that is only the bank 4 / xec 0x5 signature (NB GART TLB error), whose
 * reporting is disabled unless report_gart_errors is set.
 */
static bool amd_filter_mce(struct mce *m)
{
	u8 xec = (m->status >> 16) & 0x1f;
	bool gart_tlb_err = (m->bank == 4 && xec == 0x5);

	return gart_tlb_err && !report_gart_errors;
}

B
Borislav Petkov 已提交
1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056
/*
 * Map the severity bits of @m to a human-readable summary string.
 *
 * Uncorrected errors are classified by PCC and then by RIPV in mcgstatus;
 * errors that are not uncorrected are either deferred or corrected.
 */
static const char *decode_error_status(struct mce *m)
{
	if (!(m->status & MCI_STATUS_UC)) {
		if (m->status & MCI_STATUS_DEFERRED)
			return "Deferred error.";

		return "Corrected error, no action required.";
	}

	if (m->status & MCI_STATUS_PCC)
		return "System Fatal error.";

	return (m->mcgstatus & MCG_STATUS_RIPV) ?
		"Uncorrected, software restartable error." :
		"Uncorrected, software containable error.";
}

B
Borislav Petkov 已提交
1057
int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
1058
{
1059
	struct mce *m = (struct mce *)data;
1060
	struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
1061
	int ecc;
1062

1063 1064 1065
	if (amd_filter_mce(m))
		return NOTIFY_STOP;

1066 1067 1068 1069 1070 1071 1072
	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		c->x86, c->x86_model, c->x86_mask,
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
1073 1074
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
1075 1076 1077 1078
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"));

1079
	if (c->x86 >= 0x15)
1080 1081 1082 1083
		pr_cont("|%s|%s",
			((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
			((m->status & MCI_STATUS_POISON)   ? "Poison"   : "-"));

1084
	if (boot_cpu_has(X86_FEATURE_SMCA)) {
1085 1086 1087
		u32 low, high;
		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);

1088 1089
		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));

1090 1091 1092 1093 1094
		if (!rdmsr_safe(addr, &low, &high) &&
		    (low & MCI_CONFIG_MCAX))
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
	}

1095 1096 1097 1098 1099 1100 1101 1102
	/* do the two bits[14:13] together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
1103
		pr_emerg(HW_ERR "Error Addr: 0x%016llx", m->addr);
1104

1105
	if (boot_cpu_has(X86_FEATURE_SMCA)) {
1106 1107 1108 1109 1110
		if (m->status & MCI_STATUS_SYNDV)
			pr_cont(", Syndrome: 0x%016llx", m->synd);

		pr_cont("\n");

1111 1112
		decode_smca_errors(m);
		goto err_code;
1113 1114
	} else
		pr_cont("\n");
1115

1116 1117 1118
	if (!fam_ops)
		goto err_code;

1119 1120
	switch (m->bank) {
	case 0:
1121
		decode_mc0_mce(m);
1122
		break;
1123

1124
	case 1:
1125
		decode_mc1_mce(m);
1126 1127
		break;

1128
	case 2:
1129
		decode_mc2_mce(m);
1130 1131
		break;

1132
	case 3:
1133
		decode_mc3_mce(m);
1134 1135
		break;

1136
	case 4:
1137
		decode_mc4_mce(m);
1138 1139
		break;

B
Borislav Petkov 已提交
1140
	case 5:
1141
		decode_mc5_mce(m);
B
Borislav Petkov 已提交
1142 1143
		break;

1144
	case 6:
1145
		decode_mc6_mce(m);
1146 1147
		break;

1148 1149
	default:
		break;
1150
	}
1151

1152
 err_code:
1153
	amd_decode_err_code(m->status & 0xffff);
1154 1155

	return NOTIFY_STOP;
1156
}
B
Borislav Petkov 已提交
1157
EXPORT_SYMBOL_GPL(amd_decode_mce);
1158

1159 1160 1161 1162
/* Notifier block that hooks amd_decode_mce() into the MCE decode chain. */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
};

1163 1164
/*
 * Set up the decoder: allocate fam_ops, select the per-family MC0/MC1/MC2
 * callbacks and the xec mask, then register on the MCE decode chain.
 *
 * Returns 0 on success, -ENODEV on non-AMD CPUs, -ENOMEM when the callback
 * table cannot be allocated and -EINVAL for unsupported families (including
 * family 0x17 without Scalable MCA).
 */
static int __init mce_amd_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor != X86_VENDOR_AMD)
		return -ENODEV;

	fam_ops = kzalloc(sizeof(*fam_ops), GFP_KERNEL);
	if (!fam_ops)
		return -ENOMEM;

	switch (c->x86) {
	case 0xf:
	case 0x11:
		fam_ops->mc0_mce = k8_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x10:
		fam_ops->mc0_mce = f10h_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x12:
		fam_ops->mc0_mce = f12h_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x14:
		fam_ops->mc0_mce = cat_mc0_mce;
		fam_ops->mc1_mce = cat_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x15:
		/* model 0x60 uses a wider extended error code field */
		xec_mask = (c->x86_model == 0x60) ? 0x3f : 0x1f;

		fam_ops->mc0_mce = f15h_mc0_mce;
		fam_ops->mc1_mce = f15h_mc1_mce;
		fam_ops->mc2_mce = f15h_mc2_mce;
		break;

	case 0x16:
		xec_mask = 0x1f;

		fam_ops->mc0_mce = cat_mc0_mce;
		fam_ops->mc1_mce = cat_mc1_mce;
		fam_ops->mc2_mce = f16h_mc2_mce;
		break;

	case 0x17:
		/* no per-family callbacks: decoding goes through the SMCA path */
		xec_mask = 0x3f;

		if (!boot_cpu_has(X86_FEATURE_SMCA)) {
			printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
			goto err_out;
		}
		break;

	default:
		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
		goto err_out;
	}

	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	mce_register_decode_chain(&amd_mce_dec_nb);

	return 0;

err_out:
	kfree(fam_ops);
	fam_ops = NULL;
	return -EINVAL;
}
early_initcall(mce_amd_init);
1245 1246 1247 1248

#ifdef MODULE
/*
 * Module teardown: leave the MCE decode chain first so amd_decode_mce() is
 * no longer invoked through it, then release the callback table.
 */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif