mce.h 11.5 KB
Newer Older
H
H. Peter Anvin 已提交
1 2
#ifndef _ASM_X86_MCE_H
#define _ASM_X86_MCE_H
3

4
#include <uapi/asm/mce.h>
5

6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Machine Check support for x86
 */

/* MCG_CAP register defines */
#define MCG_BANKCNT_MASK	0xff         /* Number of Banks */
#define MCG_CTL_P		(1ULL<<8)    /* MCG_CTL register available */
#define MCG_EXT_P		(1ULL<<9)    /* Extended registers available */
#define MCG_CMCI_P		(1ULL<<10)   /* CMCI supported */
#define MCG_EXT_CNT_MASK	0xff0000     /* Number of Extended registers */
#define MCG_EXT_CNT_SHIFT	16
/* Extract the extended-register count field (bits 23:16) from an MCG_CAP value. */
#define MCG_EXT_CNT(c)		(((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
#define MCG_SER_P		(1ULL<<24)   /* MCA recovery/new status bits */
#define MCG_ELOG_P		(1ULL<<26)   /* Extended error log supported */
#define MCG_LMCE_P		(1ULL<<27)   /* Local machine check supported */

/* MCG_STATUS register defines */
#define MCG_STATUS_RIPV  (1ULL<<0)   /* restart ip valid */
#define MCG_STATUS_EIPV  (1ULL<<1)   /* ip points to correct instruction */
#define MCG_STATUS_MCIP  (1ULL<<2)   /* machine check in progress */
#define MCG_STATUS_LMCES (1ULL<<3)   /* LMCE signaled */

/* MCG_EXT_CTL register defines */
#define MCG_EXT_CTL_LMCE_EN (1ULL<<0) /* Enable LMCE */

/* MCi_STATUS register defines */
#define MCI_STATUS_VAL   (1ULL<<63)  /* valid error */
#define MCI_STATUS_OVER  (1ULL<<62)  /* previous errors lost */
#define MCI_STATUS_UC    (1ULL<<61)  /* uncorrected error */
#define MCI_STATUS_EN    (1ULL<<60)  /* error enabled */
#define MCI_STATUS_MISCV (1ULL<<59)  /* misc error reg. valid */
#define MCI_STATUS_ADDRV (1ULL<<58)  /* addr reg. valid */
#define MCI_STATUS_PCC   (1ULL<<57)  /* processor context corrupt */
#define MCI_STATUS_S	 (1ULL<<56)  /* Signaled machine check */
#define MCI_STATUS_AR	 (1ULL<<55)  /* Action required */

/* AMD-specific bits */
#define MCI_STATUS_DEFERRED	(1ULL<<44)  /* uncorrected error, deferred exception */
#define MCI_STATUS_POISON	(1ULL<<43)  /* access poisonous data */
#define MCI_STATUS_TCC		(1ULL<<55)  /* Task context corrupt */

/*
 * McaX field if set indicates a given bank supports MCA extensions:
 *  - Deferred error interrupt type is specifiable by bank.
 *  - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers,
 *    But should not be used to determine MSR numbers.
 *  - TCC bit is present in MCx_STATUS.
 */
#define MCI_CONFIG_MCAX		0x1
#define MCI_IPID_MCATYPE	0xFFFF0000
#define MCI_IPID_HWID		0xFFF

/*
 * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
 * bits 15:0.  But bit 12 is the 'F' bit, defined for corrected
 * errors to indicate that errors are being filtered by hardware.
 * We should mask out bit 12 when looking for specific signatures
 * of uncorrected errors - so the F bit is deliberately skipped
 * in this #define.
 */
#define MCACOD		  0xefff     /* MCA Error Code */

/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
#define MCACOD_SCRUB	0x00C0	/* 0xC0-0xCF Memory Scrubbing */
#define MCACOD_SCRUBMSK	0xeff0	/* Skip bit 12 ('F' bit) */
#define MCACOD_L3WB	0x017A	/* L3 Explicit Writeback */
#define MCACOD_DATA	0x0134	/* Data Load */
#define MCACOD_INSTR	0x0150	/* Instruction Fetch */

/*
 * MCi_MISC register defines
 *
 * MCI_MISC_ADDR_LSB() yields bits 5:0 of the value, MCI_MISC_ADDR_MODE()
 * yields bits 8:6; the constants below are the address-mode encodings.
 */
#define MCI_MISC_ADDR_LSB(m)		((m) & 0x3f)
#define MCI_MISC_ADDR_MODE(m)		(((m) >> 6) & 0x7)
#define  MCI_MISC_ADDR_SEGOFF		0	/* mode: segment offset */
#define  MCI_MISC_ADDR_LINEAR		1	/* mode: linear address */
#define  MCI_MISC_ADDR_PHYS		2	/* mode: physical address */
#define  MCI_MISC_ADDR_MEM		3	/* mode: memory address */
#define  MCI_MISC_ADDR_GENERIC		7	/* mode: generic */

/* CTL2 register defines */
#define MCI_CTL2_CMCI_EN		(1ULL<<30)	/* bit 30: CMCI enable */
#define MCI_CTL2_CMCI_THRESHOLD_MASK	0x7fffULL	/* bits 14:0: CMCI threshold */

/* Injection flags: the low two bits select the context, the rest are options. */
#define MCJ_CTX_MASK		3
#define MCJ_CTX(flags)		((flags) & MCJ_CTX_MASK)
#define MCJ_CTX_RANDOM		0    /* inject context: random */
#define MCJ_CTX_PROCESS		0x1  /* inject context: process */
#define MCJ_CTX_IRQ		0x2  /* inject context: IRQ */
#define MCJ_NMI_BROADCAST	0x4  /* do NMI broadcasting */
#define MCJ_EXCEPTION		0x8  /* raise as exception */
#define MCJ_IRQ_BROADCAST	0x10 /* do IRQ broadcasting */

/* Bit 0 in flags means overflow. */
#define MCE_OVERFLOW		0

/* Software defined banks: numbered from MCE_EXTENDED_BANK upwards. */
#define MCE_EXTENDED_BANK	128
#define MCE_THERMAL_BANK	(MCE_EXTENDED_BANK + 0)

/* Number of log entries and the 12-character log signature. */
#define MCE_LOG_LEN		32
#define MCE_LOG_SIGNATURE	"MACHINECHECK"

/*
 * AMD Scalable MCA
 *
 * Bank 0 register numbers, followed by per-bank accessors.  Each bank's
 * register block is 0x10 MSRs apart, so bank x lives at "MC0 + 0x10 * x".
 */
#define MSR_AMD64_SMCA_MC0_CTL		0xc0002000
#define MSR_AMD64_SMCA_MC0_STATUS	0xc0002001
#define MSR_AMD64_SMCA_MC0_ADDR		0xc0002002
#define MSR_AMD64_SMCA_MC0_MISC0	0xc0002003
#define MSR_AMD64_SMCA_MC0_CONFIG	0xc0002004
#define MSR_AMD64_SMCA_MC0_IPID		0xc0002005
#define MSR_AMD64_SMCA_MC0_MISC1	0xc000200a
#define MSR_AMD64_SMCA_MCx_CTL(x)	(MSR_AMD64_SMCA_MC0_CTL + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_STATUS(x)	(MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_ADDR(x)	(MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_MISC(x)	(MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_CONFIG(x)	(MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_IPID(x)	(MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
/* y is an offset from MISC1; parenthesized so expression arguments expand safely. */
#define MSR_AMD64_SMCA_MCx_MISCy(x, y)	((MSR_AMD64_SMCA_MC0_MISC1 + (y)) + (0x10*(x)))

122 123 124 125 126 127 128 129 130 131 132 133 134 135
/*
 * This structure contains all data related to the MCE log.  Also
 * carries a signature to make it easier to find from external
 * debugging tools.  Each entry is only valid when its finished flag
 * is set.
 */
struct mce_log {
	char signature[12]; /* "MACHINECHECK" */
	unsigned len;	    /* = MCE_LOG_LEN */
	unsigned next;
	unsigned flags;
	unsigned recordlen;	/* length of struct mce */
	struct mce entry[MCE_LOG_LEN];
};
136 137 138

struct mca_config {
	bool dont_log_ce;
139
	bool cmci_disabled;
140
	bool lmce_disabled;
141
	bool ignore_ce;
142 143
	bool disabled;
	bool ser;
144
	bool recovery;
145
	bool bios_cmci_threshold;
146
	u8 banks;
147
	s8 bootlog;
148
	int tolerant;
149
	int monarch_timeout;
150
	int panic_timeout;
151
	u32 rip_msr;
152 153
};

154
struct mce_vendor_flags {
155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
	/*
	 * Indicates that overflow conditions are not fatal, when set.
	 */
	__u64 overflow_recov	: 1,

	/*
	 * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and
	 * Recovery. It indicates support for data poisoning in HW and deferred
	 * error interrupts.
	 */
	      succor		: 1,

	/*
	 * (AMD) SMCA: This bit indicates support for Scalable MCA which expands
	 * the register space for each MCA bank and also increases number of
	 * banks. Also, to accommodate the new banks and registers, the MCA
	 * register space is moved to a new MSR range.
	 */
	      smca		: 1,

	      __reserved_0	: 61;
176
};
177 178 179 180 181 182 183 184

/*
 * Table of per-bank callbacks, one for each of the ctl/status/addr/misc
 * registers.  Each takes a bank number and returns a u32 (presumably the
 * MSR number for that bank - confirm against the implementation).
 */
struct mca_msr_regs {
	u32 (*ctl)(int bank);
	u32 (*status)(int bank);
	u32 (*addr)(int bank);
	u32 (*misc)(int bank);
};

/* Shared MCE state, defined outside this header. */
extern struct mce_vendor_flags mce_flags;

extern struct mca_config mca_cfg;
extern struct mca_msr_regs msr_ops;

/* Add/remove a notifier block on the MCE decode chain. */
extern void mce_register_decode_chain(struct notifier_block *nb);
extern void mce_unregister_decode_chain(struct notifier_block *nb);

H
Hidetoshi Seto 已提交
192
#include <linux/percpu.h>
A
Arun Sharma 已提交
193
#include <linux/atomic.h>
H
Hidetoshi Seto 已提交
194

/* Set to 1 by enable_p5_mce() when CONFIG_X86_ANCIENT_MCE is enabled. */
extern int mce_p5_enabled;

/*
 * Core machine-check setup entry points.  Without CONFIG_X86_MCE they
 * collapse to empty inline stubs (mcheck_init() then returns 0), so
 * callers need no #ifdef of their own.
 */
#ifdef CONFIG_X86_MCE
int mcheck_init(void);
void mcheck_cpu_init(struct cpuinfo_x86 *c);
void mcheck_cpu_clear(struct cpuinfo_x86 *c);
void mcheck_vendor_init_severity(void);
#else
static inline int mcheck_init(void) { return 0; }
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {}
static inline void mcheck_vendor_init_severity(void) {}
#endif

/*
 * P5 / WinChip machine-check support.  Without CONFIG_X86_ANCIENT_MCE
 * all three helpers are empty inline stubs.
 */
#ifdef CONFIG_X86_ANCIENT_MCE
void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
void winchip_mcheck_init(struct cpuinfo_x86 *c);
static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
#else
static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
static inline void enable_p5_mce(void) {}
#endif

219
void mce_setup(struct mce *m);
220
void mce_log(struct mce *m);
221
DECLARE_PER_CPU(struct device *, mce_device);
222

/*
 * Maximum banks number.
 * This is the limit of the current register layout on
 * Intel CPUs.
 */
#define MAX_NR_BANKS 32

/*
 * Intel-specific feature setup and CMCI helpers.  Without
 * CONFIG_X86_MCE_INTEL every one of them is an empty inline stub.
 */
#ifdef CONFIG_X86_MCE_INTEL
void mce_intel_feature_init(struct cpuinfo_x86 *c);
void mce_intel_feature_clear(struct cpuinfo_x86 *c);
void cmci_clear(void);
void cmci_reenable(void);
void cmci_rediscover(void);
void cmci_recheck(void);
#else
static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
static inline void mce_intel_feature_clear(struct cpuinfo_x86 *c) { }
static inline void cmci_clear(void) {}
static inline void cmci_reenable(void) {}
static inline void cmci_rediscover(void) {}
static inline void cmci_recheck(void) {}
#endif

/*
 * mce_amd_feature_init() is only provided externally when
 * CONFIG_X86_MCE_AMD is set; otherwise an empty inline stub stands in
 * so callers need no #ifdef.
 */
#ifndef CONFIG_X86_MCE_AMD
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
#else
void mce_amd_feature_init(struct cpuinfo_x86 *c);
#endif

252
int mce_available(struct cpuinfo_x86 *c);
A
Andi Kleen 已提交
253

254
DECLARE_PER_CPU(unsigned, mce_exception_count);
255
DECLARE_PER_CPU(unsigned, mce_poll_count);
256

257 258 259
typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);

260
enum mcp_flags {
261 262 263
	MCP_TIMESTAMP	= BIT(0),	/* log time stamp */
	MCP_UC		= BIT(1),	/* log uncorrected errors */
	MCP_DONTLOG	= BIT(2),	/* only clear, don't log */
264
};
265
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
266

267
int mce_notify_irq(void);
268

269
DECLARE_PER_CPU(struct mce, injectm);
270 271 272 273

extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
				    const char __user *ubuf,
				    size_t usize, loff_t *off));
274

275 276 277
/* Disable CMCI/polling for MCA bank claimed by firmware */
extern void mce_disable_bank(int bank);

/*
 * Exception handler
 */

/* Call the installed machine check handler for this CPU setup. */
extern void (*machine_check_vector)(struct pt_regs *regs, long error_code);
void do_machine_check(struct pt_regs *regs, long error_code);

/*
 * Threshold handler
 */

extern void (*mce_threshold_vector)(void);
extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

/* Deferred error interrupt handler */
extern void (*deferred_error_int_vector)(void);

/*
 * Thermal handler
 */

void intel_init_thermal(struct cpuinfo_x86 *c);

void mce_log_therm_throt_event(__u64 status);

/* Interrupt Handler for core thermal thresholds */
extern int (*platform_thermal_notify)(__u64 msr_val);

/* Interrupt Handler for package thermal thresholds */
extern int (*platform_thermal_package_notify)(__u64 msr_val);

/* Callback support of rate control, return true, if
 * callback has rate control */
extern bool (*platform_thermal_package_rate_control)(void);

/* Empty inline stub when CONFIG_X86_THERMAL_VECTOR is not set. */
#ifdef CONFIG_X86_THERMAL_VECTOR
extern void mcheck_intel_therm_init(void);
#else
static inline void mcheck_intel_therm_init(void) { }
#endif

/*
 * Used by APEI to report memory error via /dev/mcelog
 */

struct cper_sec_mem_err;
extern void apei_mce_report_mem_error(int corrected,
				      struct cper_sec_mem_err *mem_err);

/*
 * Enumerate new IP types and HWID values in AMD processors which support
 * Scalable MCA.
 */
#ifdef CONFIG_X86_MCE_AMD
/* IP block types; N_AMD_IP_TYPES is the count and sizes amd_hwids[]. */
enum amd_ip_types {
	SMCA_F17H_CORE = 0,	/* core errors */
	SMCA_DF,		/* data fabric */
	SMCA_UMC,		/* unified memory controller */
	SMCA_PB,		/* parameter block */
	SMCA_PSP,		/* platform security processor */
	SMCA_SMU,		/* system management unit */
	N_AMD_IP_TYPES
};

/* Pairs a name string with a hardware ID value. */
struct amd_hwid {
	const char *name;
	unsigned int hwid;
};

/* One entry per enum amd_ip_types value. */
extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES];

/* Core MCA blocks; N_CORE_MCA_BLOCKS is the count and sizes the name table. */
enum amd_core_mca_blocks {
	SMCA_LS = 0,	/* load store */
	SMCA_IF,	/* instruction fetch */
	SMCA_L2_CACHE,	/* L2 cache */
	SMCA_DE,	/* decoder unit */
	RES,		/* reserved */
	SMCA_EX,	/* execution unit */
	SMCA_FP,	/* floating point */
	SMCA_L3_CACHE,	/* L3 cache */
	N_CORE_MCA_BLOCKS
};

/* One name string per enum amd_core_mca_blocks value. */
extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS];

/* Data fabric MCA blocks; N_DF_BLOCKS is the count and sizes the name table. */
enum amd_df_mca_blocks {
	SMCA_CS = 0,	/* coherent slave */
	SMCA_PIE,	/* power management, interrupts, etc */
	N_DF_BLOCKS
};

/* One name string per enum amd_df_mca_blocks value. */
extern const char * const amd_df_mcablock_names[N_DF_BLOCKS];
#endif

H
H. Peter Anvin 已提交
373
#endif /* _ASM_X86_MCE_H */