mce.h 11.8 KB
Newer Older
H
H. Peter Anvin 已提交
1 2
#ifndef _ASM_X86_MCE_H
#define _ASM_X86_MCE_H
3

4
#include <uapi/asm/mce.h>
5

6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Machine Check support for x86
 */

/*
 * MCG_CAP register: bank count, feature-present bits and the
 * extended-register count field.
 */
#define MCG_BANKCNT_MASK	0xff		/* number of banks */
#define MCG_CTL_P		(1ULL << 8)	/* MCG_CTL register is available */
#define MCG_EXT_P		(1ULL << 9)	/* extended registers are available */
#define MCG_CMCI_P		(1ULL << 10)	/* CMCI is supported */
#define MCG_EXT_CNT_MASK	0xff0000	/* number of extended registers */
#define MCG_EXT_CNT_SHIFT	16
/* Extract the extended-register count field from an MCG_CAP value. */
#define MCG_EXT_CNT(c)		(((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
#define MCG_SER_P		(1ULL << 24)	/* MCA recovery / new status bits */
19
#define MCG_ELOG_P		(1ULL<<26)   /* Extended error log supported */
A
Ashok Raj 已提交
20
#define MCG_LMCE_P		(1ULL<<27)   /* Local machine check supported */
21 22 23 24 25

/* MCG_STATUS register bits */
#define MCG_STATUS_RIPV		(1ULL << 0)	/* restart IP is valid */
#define MCG_STATUS_EIPV		(1ULL << 1)	/* IP points to the correct instruction */
#define MCG_STATUS_MCIP		(1ULL << 2)	/* a machine check is in progress */
A
Ashok Raj 已提交
26 27 28 29
#define MCG_STATUS_LMCES (1ULL<<3)   /* LMCE signaled */

/* MCG_EXT_CTL register defines */
#define MCG_EXT_CTL_LMCE_EN (1ULL<<0) /* Enable LMCE */
30 31 32 33 34 35 36 37 38 39 40

/* MCi_STATUS register bits (per-bank status) */
#define MCI_STATUS_VAL		(1ULL << 63)	/* error is valid */
#define MCI_STATUS_OVER		(1ULL << 62)	/* previous errors were lost */
#define MCI_STATUS_UC		(1ULL << 61)	/* error is uncorrected */
#define MCI_STATUS_EN		(1ULL << 60)	/* error is enabled */
#define MCI_STATUS_MISCV	(1ULL << 59)	/* misc error register is valid */
#define MCI_STATUS_ADDRV	(1ULL << 58)	/* address register is valid */
#define MCI_STATUS_PCC		(1ULL << 57)	/* processor context is corrupt */
#define MCI_STATUS_S		(1ULL << 56)	/* signaled machine check */
#define MCI_STATUS_AR		(1ULL << 55)	/* action required */
41

42
/* AMD-specific bits */
43
#define MCI_STATUS_DEFERRED	(1ULL<<44)  /* uncorrected error, deferred exception */
44
#define MCI_STATUS_POISON	(1ULL<<43)  /* access poisonous data */
45 46 47 48 49 50 51 52 53 54 55 56
#define MCI_STATUS_TCC		(1ULL<<55)  /* Task context corrupt */

/*
 * When the McaX field is set, the given bank supports the MCA extensions:
 *  - The deferred error interrupt type can be specified per bank.
 *  - The MCx_MISC0[BlkPtr] field indicates the presence of extended MISC
 *    registers, but it should not be used to determine MSR numbers.
 *  - The TCC bit is present in MCx_STATUS.
 */
#define MCI_CONFIG_MCAX		0x1
#define MCI_IPID_MCATYPE	0xffff0000
#define MCI_IPID_HWID		0xfff
57

58 59 60 61 62 63 64 65 66
/*
 * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
 * bits 15:0.  But bit 12 is the 'F' bit, defined for corrected
 * errors to indicate that errors are being filtered by hardware.
 * We should mask out bit 12 when looking for specific signatures
 * of uncorrected errors - so the F bit is deliberately skipped
 * in this #define.
 */
#define MCACOD		  0xefff     /* MCA Error Code */
67 68 69

/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
#define MCACOD_SCRUB	0x00C0	/* 0xC0-0xCF Memory Scrubbing */
70
#define MCACOD_SCRUBMSK	0xeff0	/* Skip bit 12 ('F' bit) */
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
#define MCACOD_L3WB	0x017A	/* L3 Explicit Writeback */
#define MCACOD_DATA	0x0134	/* Data Load */
#define MCACOD_INSTR	0x0150	/* Instruction Fetch */

/* MCi_MISC register field accessors and address-mode values */
#define MCI_MISC_ADDR_LSB(m)	((m) & 0x3f)		/* bits 5:0 */
#define MCI_MISC_ADDR_MODE(m)	(((m) >> 6) & 0x7)	/* bits 8:6 */
#define  MCI_MISC_ADDR_SEGOFF	0	/* mode: segment offset */
#define  MCI_MISC_ADDR_LINEAR	1	/* mode: linear address */
#define  MCI_MISC_ADDR_PHYS	2	/* mode: physical address */
#define  MCI_MISC_ADDR_MEM	3	/* mode: memory address */
#define  MCI_MISC_ADDR_GENERIC	7	/* mode: generic */

/* CTL2 register: CMCI enable bit and threshold field mask */
#define MCI_CTL2_CMCI_EN		(1ULL<<30)
#define MCI_CTL2_CMCI_THRESHOLD_MASK	0x7FFFULL

/* Injection flags: the low two bits select the inject context. */
#define MCJ_CTX_MASK		0x3
#define MCJ_CTX(flags)		((flags) & MCJ_CTX_MASK)
#define MCJ_CTX_RANDOM		0x0	/* inject context: random */
#define MCJ_CTX_PROCESS		0x1	/* inject context: process */
#define MCJ_CTX_IRQ		0x2	/* inject context: IRQ */
#define MCJ_NMI_BROADCAST	0x4	/* do NMI broadcasting */
#define MCJ_EXCEPTION		0x8	/* raise as exception */
M
Mathias Krause 已提交
95
#define MCJ_IRQ_BROADCAST	0x10 /* do IRQ broadcasting */
96 97 98 99 100 101 102 103 104 105

#define MCE_OVERFLOW 0		/* bit 0 in flags means overflow */

/* Software defined banks */
#define MCE_EXTENDED_BANK	128
#define MCE_THERMAL_BANK	(MCE_EXTENDED_BANK + 0)

#define MCE_LOG_LEN 32
#define MCE_LOG_SIGNATURE	"MACHINECHECK"

106
/*
 * AMD Scalable MCA
 *
 * MC0_* are the MSR addresses of bank 0.  The MCx_*(x) helpers yield the
 * address of the same register in bank x; consecutive banks are 0x10
 * apart.
 */
#define MSR_AMD64_SMCA_MC0_CTL		0xc0002000
#define MSR_AMD64_SMCA_MC0_STATUS	0xc0002001
#define MSR_AMD64_SMCA_MC0_ADDR		0xc0002002
#define MSR_AMD64_SMCA_MC0_MISC0	0xc0002003
#define MSR_AMD64_SMCA_MC0_CONFIG	0xc0002004
#define MSR_AMD64_SMCA_MC0_IPID		0xc0002005
#define MSR_AMD64_SMCA_MC0_DESTAT	0xc0002008
#define MSR_AMD64_SMCA_MC0_DEADDR	0xc0002009
#define MSR_AMD64_SMCA_MC0_MISC1	0xc000200a
#define MSR_AMD64_SMCA_MCx_CTL(x)	(MSR_AMD64_SMCA_MC0_CTL + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_STATUS(x)	(MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_ADDR(x)	(MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_MISC(x)	(MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_CONFIG(x)	(MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_IPID(x)	(MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DESTAT(x)	(MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DEADDR(x)	(MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
/*
 * y is an offset from MISC1 (y == 0 selects MISC1 itself).  Both arguments
 * are parenthesized so that compound expressions expand correctly.
 */
#define MSR_AMD64_SMCA_MCx_MISCy(x, y)	((MSR_AMD64_SMCA_MC0_MISC1 + (y)) + (0x10*(x)))
125

126 127 128 129 130 131 132 133 134 135 136 137 138 139
/*
 * Holds all data related to the MCE log.  The signature makes the
 * structure easier to locate from external debugging tools.  An entry
 * is only valid once its finished flag is set.
 */
struct mce_log {
	char		signature[12];		/* "MACHINECHECK" */
	unsigned int	len;			/* = MCE_LOG_LEN */
	unsigned int	next;
	unsigned int	flags;
	unsigned int	recordlen;		/* length of struct mce */
	struct mce	entry[MCE_LOG_LEN];
};
140 141 142

struct mca_config {
	bool dont_log_ce;
143
	bool cmci_disabled;
144
	bool lmce_disabled;
145
	bool ignore_ce;
146 147
	bool disabled;
	bool ser;
148
	bool recovery;
149
	bool bios_cmci_threshold;
150
	u8 banks;
151
	s8 bootlog;
152
	int tolerant;
153
	int monarch_timeout;
154
	int panic_timeout;
155
	u32 rip_msr;
156 157
};

158
struct mce_vendor_flags {
159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
	/*
	 * Indicates that overflow conditions are not fatal, when set.
	 */
	__u64 overflow_recov	: 1,

	/*
	 * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and
	 * Recovery. It indicates support for data poisoning in HW and deferred
	 * error interrupts.
	 */
	      succor		: 1,

	/*
	 * (AMD) SMCA: This bit indicates support for Scalable MCA which expands
	 * the register space for each MCA bank and also increases number of
	 * banks. Also, to accommodate the new banks and registers, the MCA
	 * register space is moved to a new MSR range.
	 */
	      smca		: 1,

	      __reserved_0	: 61;
180
};
181 182 183 184 185 186 187 188

/*
 * Table of per-register callbacks.  Each one takes a bank number and
 * returns a u32.
 * NOTE(review): presumably the returned value is the MSR address of that
 * register for the given bank - confirm against the implementations.
 */
struct mca_msr_regs {
	u32 (*ctl)(int bank);
	u32 (*status)(int bank);
	u32 (*addr)(int bank);
	u32 (*misc)(int bank);
};

189 190
extern struct mce_vendor_flags mce_flags;

191
extern struct mca_config mca_cfg;
192
extern struct mca_msr_regs msr_ops;
193
extern void mce_register_decode_chain(struct notifier_block *nb);
194
extern void mce_unregister_decode_chain(struct notifier_block *nb);
195

H
Hidetoshi Seto 已提交
196
#include <linux/percpu.h>
A
Arun Sharma 已提交
197
#include <linux/atomic.h>
H
Hidetoshi Seto 已提交
198

199
extern int mce_p5_enabled;
200

H
Hidetoshi Seto 已提交
201
#ifdef CONFIG_X86_MCE
202
int mcheck_init(void);
203
void mcheck_cpu_init(struct cpuinfo_x86 *c);
204
void mcheck_cpu_clear(struct cpuinfo_x86 *c);
205
void mcheck_vendor_init_severity(void);
H
Hidetoshi Seto 已提交
206
#else
207
static inline int mcheck_init(void) { return 0; }
208
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
209
static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {}
210
static inline void mcheck_vendor_init_severity(void) {}
H
Hidetoshi Seto 已提交
211 212
#endif

H
Hidetoshi Seto 已提交
213 214 215
#ifdef CONFIG_X86_ANCIENT_MCE
void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
void winchip_mcheck_init(struct cpuinfo_x86 *c);
216
static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
H
Hidetoshi Seto 已提交
217 218 219
#else
static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
220
static inline void enable_p5_mce(void) {}
H
Hidetoshi Seto 已提交
221 222
#endif

223
void mce_setup(struct mce *m);
224
void mce_log(struct mce *m);
225
DECLARE_PER_CPU(struct device *, mce_device);
226

A
Andi Kleen 已提交
227
/*
228 229 230
 * Maximum banks number.
 * This is the limit of the current register layout on
 * Intel CPUs.
A
Andi Kleen 已提交
231
 */
232
#define MAX_NR_BANKS 32
A
Andi Kleen 已提交
233

234 235
#ifdef CONFIG_X86_MCE_INTEL
void mce_intel_feature_init(struct cpuinfo_x86 *c);
236
void mce_intel_feature_clear(struct cpuinfo_x86 *c);
A
Andi Kleen 已提交
237 238
void cmci_clear(void);
void cmci_reenable(void);
239
void cmci_rediscover(void);
A
Andi Kleen 已提交
240
void cmci_recheck(void);
241 242
#else
static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
243
static inline void mce_intel_feature_clear(struct cpuinfo_x86 *c) { }
A
Andi Kleen 已提交
244 245
static inline void cmci_clear(void) {}
static inline void cmci_reenable(void) {}
246
static inline void cmci_rediscover(void) {}
A
Andi Kleen 已提交
247
static inline void cmci_recheck(void) {}
248 249 250 251 252 253 254 255
#endif

/*
 * AMD feature initialization hook: a real function when
 * CONFIG_X86_MCE_AMD is set, otherwise an empty inline stub so callers
 * need no #ifdef of their own.
 */
#ifdef CONFIG_X86_MCE_AMD
void mce_amd_feature_init(struct cpuinfo_x86 *c);
#else
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
}
#endif

256
int mce_available(struct cpuinfo_x86 *c);
A
Andi Kleen 已提交
257

258
DECLARE_PER_CPU(unsigned, mce_exception_count);
259
DECLARE_PER_CPU(unsigned, mce_poll_count);
260

261 262 263
typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);

264
enum mcp_flags {
265 266 267
	MCP_TIMESTAMP	= BIT(0),	/* log time stamp */
	MCP_UC		= BIT(1),	/* log uncorrected errors */
	MCP_DONTLOG	= BIT(2),	/* only clear, don't log */
268
};
269
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
270

271
int mce_notify_irq(void);
272

273
DECLARE_PER_CPU(struct mce, injectm);
274 275 276 277

extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
				    const char __user *ubuf,
				    size_t usize, loff_t *off));
278

279 280 281
/* Disable CMCI/polling for MCA bank claimed by firmware */
extern void mce_disable_bank(int bank);

H
Hidetoshi Seto 已提交
282 283 284 285 286 287 288 289 290 291 292
/*
 * Exception handler
 */

/* Call the installed machine check handler for this CPU setup. */
extern void (*machine_check_vector)(struct pt_regs *regs, long error_code);
/*
 * Same signature as machine_check_vector.
 * NOTE(review): presumably the handler installed into machine_check_vector
 * on supported CPUs - confirm against the definition.
 */
void do_machine_check(struct pt_regs *regs, long error_code);

/*
 * Threshold handler
 */
293

294
extern void (*mce_threshold_vector)(void);
H
Hidetoshi Seto 已提交
295
extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
296

297 298 299
/* Deferred error interrupt handler */
extern void (*deferred_error_int_vector)(void);

300 301 302 303 304 305 306
/*
 * Thermal handler
 */

void intel_init_thermal(struct cpuinfo_x86 *c);

void mce_log_therm_throt_event(__u64 status);
307

308 309 310
/* Interrupt Handler for core thermal thresholds */
extern int (*platform_thermal_notify)(__u64 msr_val);

311 312 313 314 315 316 317
/* Interrupt Handler for package thermal thresholds */
extern int (*platform_thermal_package_notify)(__u64 msr_val);

/* Callback reporting rate-control support; returns true if the
 * callback has rate control */
extern bool (*platform_thermal_package_rate_control)(void);

318 319 320 321 322 323
/*
 * Declared as a real function when CONFIG_X86_THERMAL_VECTOR is set;
 * otherwise it is an empty inline stub.
 */
#ifdef CONFIG_X86_THERMAL_VECTOR
extern void mcheck_intel_therm_init(void);
#else
static inline void mcheck_intel_therm_init(void)
{
}
#endif

324 325 326 327 328 329 330 331
/*
 * Used by APEI to report memory error via /dev/mcelog
 */

struct cper_sec_mem_err;
extern void apei_mce_report_mem_error(int corrected,
				      struct cper_sec_mem_err *mem_err);

332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376
/*
 * Enumerate new IP types and HWID values in AMD processors which support
 * Scalable MCA.
 */
#ifdef CONFIG_X86_MCE_AMD
/* IP types; N_AMD_IP_TYPES is the number of entries and sizes amd_hwids[]. */
enum amd_ip_types {
	SMCA_F17H_CORE	= 0,	/* Core errors */
	SMCA_DF		= 1,	/* Data Fabric */
	SMCA_UMC	= 2,	/* Unified Memory Controller */
	SMCA_PB		= 3,	/* Parameter Block */
	SMCA_PSP	= 4,	/* Platform Security Processor */
	SMCA_SMU	= 5,	/* System Management Unit */
	N_AMD_IP_TYPES
};

/* Pairs a name string with a hardware ID value. */
struct amd_hwid {
	const char	*name;
	unsigned int	hwid;
};

/* One entry per enum amd_ip_types value. */
extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES];

/*
 * Core MCA blocks; N_CORE_MCA_BLOCKS is the number of entries and sizes
 * amd_core_mcablock_names[].
 */
enum amd_core_mca_blocks {
	SMCA_LS		= 0,	/* Load Store */
	SMCA_IF		= 1,	/* Instruction Fetch */
	SMCA_L2_CACHE	= 2,	/* L2 cache */
	SMCA_DE		= 3,	/* Decoder unit */
	RES		= 4,	/* Reserved */
	SMCA_EX		= 5,	/* Execution unit */
	SMCA_FP		= 6,	/* Floating Point */
	SMCA_L3_CACHE	= 7,	/* L3 cache */
	N_CORE_MCA_BLOCKS
};

/* One name string per enum amd_core_mca_blocks value. */
extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS];

/*
 * Data Fabric MCA blocks; N_DF_BLOCKS is the number of entries and sizes
 * amd_df_mcablock_names[].
 */
enum amd_df_mca_blocks {
	SMCA_CS		= 0,	/* Coherent Slave */
	SMCA_PIE	= 1,	/* Power management, Interrupts, etc */
	N_DF_BLOCKS
};

/* One name string per enum amd_df_mca_blocks value. */
extern const char * const amd_df_mcablock_names[N_DF_BLOCKS];
#endif

H
H. Peter Anvin 已提交
377
#endif /* _ASM_X86_MCE_H */