#ifndef _ASM_X86_MCE_H
#define _ASM_X86_MCE_H

#include <uapi/asm/mce.h>

/*
 * Machine Check support for x86
 */

/* MCG_CAP register defines */
#define MCG_BANKCNT_MASK	0xff         /* Number of Banks */
#define MCG_CTL_P		(1ULL<<8)    /* MCG_CTL register available */
#define MCG_EXT_P		(1ULL<<9)    /* Extended registers available */
#define MCG_CMCI_P		(1ULL<<10)   /* CMCI supported */
#define MCG_EXT_CNT_MASK	0xff0000     /* Number of Extended registers */
#define MCG_EXT_CNT_SHIFT	16
#define MCG_EXT_CNT(c)		(((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
#define MCG_SER_P		(1ULL<<24)   /* MCA recovery/new status bits */
#define MCG_ELOG_P		(1ULL<<26)   /* Extended error log supported */
#define MCG_LMCE_P		(1ULL<<27)   /* Local machine check supported */
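
/*
 * Illustrative sketch (not kernel API; nr_banks/nr_ext_regs/setup_lmce()
 * are made-up names): how the MCG_CAP bits above are typically consumed,
 * assuming a raw value read via rdmsrl(MSR_IA32_MCG_CAP, cap):
 *
 *	u64 cap;
 *
 *	rdmsrl(MSR_IA32_MCG_CAP, cap);
 *	nr_banks = cap & MCG_BANKCNT_MASK;
 *	if (cap & MCG_EXT_P)
 *		nr_ext_regs = MCG_EXT_CNT(cap);
 *	if (cap & MCG_LMCE_P)
 *		setup_lmce();
 */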

/* MCG_STATUS register defines */
#define MCG_STATUS_RIPV  (1ULL<<0)   /* restart ip valid */
#define MCG_STATUS_EIPV  (1ULL<<1)   /* ip points to correct instruction */
#define MCG_STATUS_MCIP  (1ULL<<2)   /* machine check in progress */
#define MCG_STATUS_LMCES (1ULL<<3)   /* LMCE signaled */

/* MCG_EXT_CTL register defines */
#define MCG_EXT_CTL_LMCE_EN (1ULL<<0) /* Enable LMCE */

/* MCi_STATUS register defines */
#define MCI_STATUS_VAL   (1ULL<<63)  /* valid error */
#define MCI_STATUS_OVER  (1ULL<<62)  /* previous errors lost */
#define MCI_STATUS_UC    (1ULL<<61)  /* uncorrected error */
#define MCI_STATUS_EN    (1ULL<<60)  /* error enabled */
#define MCI_STATUS_MISCV (1ULL<<59)  /* misc error reg. valid */
#define MCI_STATUS_ADDRV (1ULL<<58)  /* addr reg. valid */
#define MCI_STATUS_PCC   (1ULL<<57)  /* processor context corrupt */
#define MCI_STATUS_S	 (1ULL<<56)  /* Signaled machine check */
#define MCI_STATUS_AR	 (1ULL<<55)  /* Action required */
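
/*
 * Illustrative sketch ("status" is assumed to be a raw MCi_STATUS value
 * read from the bank's status MSR): minimal triage of a logged error:
 *
 *	if (!(status & MCI_STATUS_VAL))
 *		return;				// no error in this bank
 *	if (status & MCI_STATUS_UC) {
 *		// On MCG_SER_P parts, S/AR refine recoverable errors:
 *		bool action_required = status & MCI_STATUS_AR;
 *		...
 *	}
 */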

/* AMD-specific bits */
#define MCI_STATUS_TCC		(1ULL<<55)  /* Task context corrupt */
#define MCI_STATUS_SYNDV	(1ULL<<53)  /* synd reg. valid */
#define MCI_STATUS_DEFERRED	(1ULL<<44)  /* uncorrected error, deferred exception */
#define MCI_STATUS_POISON	(1ULL<<43)  /* access poisonous data */

/*
 * The McaX field, if set, indicates that a given bank supports MCA
 * extensions:
 *  - Deferred error interrupt type is specifiable by bank.
 *  - The MCx_MISC0[BlkPtr] field indicates the presence of extended MISC
 *    registers, but should not be used to determine MSR numbers.
 *  - TCC bit is present in MCx_STATUS.
 */
#define MCI_CONFIG_MCAX		0x1
#define MCI_IPID_MCATYPE	0xFFFF0000
#define MCI_IPID_HWID		0xFFF

/*
 * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
 * bits 15:0.  But bit 12 is the 'F' bit, defined for corrected
 * errors to indicate that errors are being filtered by hardware.
 * We should mask out bit 12 when looking for specific signatures
 * of uncorrected errors - so the F bit is deliberately skipped
 * in this #define.
 */
#define MCACOD		  0xefff     /* MCA Error Code */

/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
#define MCACOD_SCRUB	0x00C0	/* 0xC0-0xCF Memory Scrubbing */
#define MCACOD_SCRUBMSK	0xeff0	/* Skip bit 12 ('F' bit) */
#define MCACOD_L3WB	0x017A	/* L3 Explicit Writeback */
#define MCACOD_DATA	0x0134	/* Data Load */
#define MCACOD_INSTR	0x0150	/* Instruction Fetch */
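
/*
 * Illustrative sketch (handle_scrub_error() is a made-up helper): matching
 * an architectural signature while skipping the 'F' bit as described above
 * ("status" is a raw MCi_STATUS value):
 *
 *	// Scrub errors span 0xC0-0xCF, so the low nibble is masked too:
 *	if ((status & MCACOD_SCRUBMSK) == MCACOD_SCRUB)
 *		handle_scrub_error();
 *	else if ((status & MCACOD) == MCACOD_DATA)
 *		...
 */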

/* MCi_MISC register defines */
#define MCI_MISC_ADDR_LSB(m)	((m) & 0x3f)
#define MCI_MISC_ADDR_MODE(m)	(((m) >> 6) & 7)
#define  MCI_MISC_ADDR_SEGOFF	0	/* segment offset */
#define  MCI_MISC_ADDR_LINEAR	1	/* linear address */
#define  MCI_MISC_ADDR_PHYS	2	/* physical address */
#define  MCI_MISC_ADDR_MEM	3	/* memory address */
#define  MCI_MISC_ADDR_GENERIC	7	/* generic */
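
/*
 * Illustrative sketch: when MCI_STATUS_ADDRV is set, MCi_MISC describes
 * which mode MCi_ADDR uses and its least significant valid bit ("misc"
 * and "addr" are raw register values; the masking is an assumption about
 * how to interpret the LSB field, not a quoted kernel snippet):
 *
 *	if (MCI_MISC_ADDR_MODE(misc) == MCI_MISC_ADDR_PHYS) {
 *		// Bits below the reported LSB are undefined, e.g. an LSB
 *		// of 6 means the address is valid to 64-byte granularity:
 *		u64 granularity = 1ULL << MCI_MISC_ADDR_LSB(misc);
 *		u64 base = addr & ~(granularity - 1);
 *	}
 */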

/* CTL2 register defines */
#define MCI_CTL2_CMCI_EN		(1ULL << 30)
#define MCI_CTL2_CMCI_THRESHOLD_MASK	0x7fffULL
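
/*
 * Illustrative sketch, assuming MSR_IA32_MCx_CTL2() from
 * <asm/msr-index.h>: enabling CMCI on a bank with a threshold of one
 * error per interrupt:
 *
 *	u64 val;
 *
 *	rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
 *	val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
 *	val |= MCI_CTL2_CMCI_EN | 1;
 *	wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
 */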

#define MCJ_CTX_MASK		3
#define MCJ_CTX(flags)		((flags) & MCJ_CTX_MASK)
#define MCJ_CTX_RANDOM		0    /* inject context: random */
#define MCJ_CTX_PROCESS		0x1  /* inject context: process */
#define MCJ_CTX_IRQ		0x2  /* inject context: IRQ */
#define MCJ_NMI_BROADCAST	0x4  /* do NMI broadcasting */
#define MCJ_EXCEPTION		0x8  /* raise as exception */
#define MCJ_IRQ_BROADCAST	0x10 /* do IRQ broadcasting */

#define MCE_OVERFLOW 0		/* bit 0 in flags means overflow */

/* Software defined banks */
#define MCE_EXTENDED_BANK	128
#define MCE_THERMAL_BANK	(MCE_EXTENDED_BANK + 0)

#define MCE_LOG_LEN 32
#define MCE_LOG_SIGNATURE	"MACHINECHECK"

/* AMD Scalable MCA */
#define MSR_AMD64_SMCA_MC0_CTL		0xc0002000
#define MSR_AMD64_SMCA_MC0_STATUS	0xc0002001
#define MSR_AMD64_SMCA_MC0_ADDR		0xc0002002
#define MSR_AMD64_SMCA_MC0_MISC0	0xc0002003
#define MSR_AMD64_SMCA_MC0_CONFIG	0xc0002004
#define MSR_AMD64_SMCA_MC0_IPID		0xc0002005
#define MSR_AMD64_SMCA_MC0_SYND		0xc0002006
#define MSR_AMD64_SMCA_MC0_DESTAT	0xc0002008
#define MSR_AMD64_SMCA_MC0_DEADDR	0xc0002009
#define MSR_AMD64_SMCA_MC0_MISC1	0xc000200a
#define MSR_AMD64_SMCA_MCx_CTL(x)	(MSR_AMD64_SMCA_MC0_CTL + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_STATUS(x)	(MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_ADDR(x)	(MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_MISC(x)	(MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_CONFIG(x)	(MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_IPID(x)	(MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_SYND(x)	(MSR_AMD64_SMCA_MC0_SYND + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DESTAT(x)	(MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DEADDR(x)	(MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_MISCy(x, y)	((MSR_AMD64_SMCA_MC0_MISC1 + (y)) + (0x10*(x)))
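
/*
 * Illustrative example: each SMCA bank occupies a window of 16 MSRs, so a
 * bank's registers sit at a fixed 0x10 stride from the MC0 addresses
 * above, e.g.:
 *
 *	MSR_AMD64_SMCA_MCx_IPID(2) == 0xc0002005 + 0x10*2 == 0xc0002025
 */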

/*
 * This structure contains all data related to the MCE log.  It also
 * carries a signature to make it easier to find from external debugging
 * tools.  Each entry is only valid when its finished flag is set.
 */
struct mce_log {
	char signature[12]; /* "MACHINECHECK" */
	unsigned len;	    /* = MCE_LOG_LEN */
	unsigned next;
	unsigned flags;
	unsigned recordlen;	/* length of struct mce */
	struct mce entry[MCE_LOG_LEN];
};
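
/*
 * Illustrative sketch (decode() is a made-up consumer; locking and
 * ordering are deliberately omitted): walking the finished records, using
 * the "finished" field of struct mce from <uapi/asm/mce.h>:
 *
 *	unsigned i;
 *
 *	for (i = 0; i < log->next; i++) {
 *		if (!log->entry[i].finished)
 *			continue;	// writer has not completed this slot
 *		decode(&log->entry[i]);
 *	}
 */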

struct mca_config {
	bool dont_log_ce;
	bool cmci_disabled;
	bool lmce_disabled;
	bool ignore_ce;
	bool disabled;
	bool ser;
	bool recovery;
	bool bios_cmci_threshold;
	u8 banks;
	s8 bootlog;
	int tolerant;
	int monarch_timeout;
	int panic_timeout;
	u32 rip_msr;
};

struct mce_vendor_flags {
	/*
	 * When set, indicates that overflow conditions are not fatal.
	 */
	__u64 overflow_recov	: 1,

	/*
	 * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and
	 * Recovery. It indicates support for data poisoning in HW and deferred
	 * error interrupts.
	 */
	      succor		: 1,

	/*
	 * (AMD) SMCA: This bit indicates support for Scalable MCA which expands
	 * the register space for each MCA bank and also increases number of
	 * banks. Also, to accommodate the new banks and registers, the MCA
	 * register space is moved to a new MSR range.
	 */
	      smca		: 1,

	      __reserved_0	: 61;
};

struct mca_msr_regs {
	u32 (*ctl)	(int bank);
	u32 (*status)	(int bank);
	u32 (*addr)	(int bank);
	u32 (*misc)	(int bank);
};

extern struct mce_vendor_flags mce_flags;

extern struct mca_config mca_cfg;
extern struct mca_msr_regs msr_ops;
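
/*
 * Illustrative sketch: msr_ops abstracts the legacy vs. SMCA MSR layouts,
 * so common code can read a bank's registers without vendor checks:
 *
 *	u64 status, addr;
 *
 *	rdmsrl(msr_ops.status(bank), status);
 *	if (status & MCI_STATUS_ADDRV)
 *		rdmsrl(msr_ops.addr(bank), addr);
 */
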
extern void mce_register_decode_chain(struct notifier_block *nb);
extern void mce_unregister_decode_chain(struct notifier_block *nb);

#include <linux/percpu.h>
#include <linux/atomic.h>

extern int mce_p5_enabled;

#ifdef CONFIG_X86_MCE
int mcheck_init(void);
void mcheck_cpu_init(struct cpuinfo_x86 *c);
void mcheck_cpu_clear(struct cpuinfo_x86 *c);
void mcheck_vendor_init_severity(void);
#else
static inline int mcheck_init(void) { return 0; }
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {}
static inline void mcheck_vendor_init_severity(void) {}
#endif

#ifdef CONFIG_X86_ANCIENT_MCE
void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
void winchip_mcheck_init(struct cpuinfo_x86 *c);
static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
#else
static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
static inline void enable_p5_mce(void) {}
#endif

void mce_setup(struct mce *m);
void mce_log(struct mce *m);
DECLARE_PER_CPU(struct device *, mce_device);

/*
 * Maximum number of banks.
 * This is the limit of the current register layout on Intel CPUs.
 */
#define MAX_NR_BANKS 32

#ifdef CONFIG_X86_MCE_INTEL
void mce_intel_feature_init(struct cpuinfo_x86 *c);
void mce_intel_feature_clear(struct cpuinfo_x86 *c);
void cmci_clear(void);
void cmci_reenable(void);
void cmci_rediscover(void);
void cmci_recheck(void);
#else
static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
static inline void mce_intel_feature_clear(struct cpuinfo_x86 *c) { }
static inline void cmci_clear(void) {}
static inline void cmci_reenable(void) {}
static inline void cmci_rediscover(void) {}
static inline void cmci_recheck(void) {}
#endif

#ifdef CONFIG_X86_MCE_AMD
void mce_amd_feature_init(struct cpuinfo_x86 *c);
#else
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
#endif

int mce_available(struct cpuinfo_x86 *c);

DECLARE_PER_CPU(unsigned, mce_exception_count);
DECLARE_PER_CPU(unsigned, mce_poll_count);

typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);

enum mcp_flags {
	MCP_TIMESTAMP	= BIT(0),	/* log time stamp */
	MCP_UC		= BIT(1),	/* log uncorrected errors */
	MCP_DONTLOG	= BIT(2),	/* only clear, don't log */
};
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
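
/*
 * Illustrative call, as made from the periodic polling path: log
 * timestamped correctable errors found in the banks this CPU polls:
 *
 *	machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
 */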

int mce_notify_irq(void);

DECLARE_PER_CPU(struct mce, injectm);

extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
				    const char __user *ubuf,
				    size_t usize, loff_t *off));

/* Disable CMCI/polling for MCA bank claimed by firmware */
extern void mce_disable_bank(int bank);

/*
 * Exception handler
 */

/* Call the installed machine check handler for this CPU setup. */
extern void (*machine_check_vector)(struct pt_regs *, long error_code);
void do_machine_check(struct pt_regs *, long);

/*
 * Threshold handler
 */

extern void (*mce_threshold_vector)(void);
extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

/* Deferred error interrupt handler */
extern void (*deferred_error_int_vector)(void);

/*
 * Thermal handler
 */

void intel_init_thermal(struct cpuinfo_x86 *c);

void mce_log_therm_throt_event(__u64 status);

/* Interrupt Handler for core thermal thresholds */
extern int (*platform_thermal_notify)(__u64 msr_val);

/* Interrupt Handler for package thermal thresholds */
extern int (*platform_thermal_package_notify)(__u64 msr_val);

/*
 * Callback support for rate control; returns true if the callback has
 * rate control.
 */
extern bool (*platform_thermal_package_rate_control)(void);

#ifdef CONFIG_X86_THERMAL_VECTOR
extern void mcheck_intel_therm_init(void);
#else
static inline void mcheck_intel_therm_init(void) { }
#endif

/*
 * Used by APEI to report memory errors via /dev/mcelog
 */

struct cper_sec_mem_err;
extern void apei_mce_report_mem_error(int corrected,
				      struct cper_sec_mem_err *mem_err);

/*
 * Enumerate new IP types and HWID values in AMD processors which support
 * Scalable MCA.
 */
#ifdef CONFIG_X86_MCE_AMD
enum amd_ip_types {
	SMCA_F17H_CORE = 0,	/* Core errors */
	SMCA_DF,		/* Data Fabric */
	SMCA_UMC,		/* Unified Memory Controller */
	SMCA_PB,		/* Parameter Block */
	SMCA_PSP,		/* Platform Security Processor */
	SMCA_SMU,		/* System Management Unit */
	N_AMD_IP_TYPES
};

struct amd_hwid {
	const char *name;
	unsigned int hwid;
};

extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES];

enum amd_core_mca_blocks {
	SMCA_LS = 0,	/* Load Store */
	SMCA_IF,	/* Instruction Fetch */
	SMCA_L2_CACHE,	/* L2 cache */
	SMCA_DE,	/* Decoder unit */
	RES,		/* Reserved */
	SMCA_EX,	/* Execution unit */
	SMCA_FP,	/* Floating Point */
	SMCA_L3_CACHE,	/* L3 cache */
	N_CORE_MCA_BLOCKS
};

extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS];

enum amd_df_mca_blocks {
	SMCA_CS = 0,	/* Coherent Slave */
	SMCA_PIE,	/* Power management, Interrupts, etc */
	N_DF_BLOCKS
};

extern const char * const amd_df_mcablock_names[N_DF_BLOCKS];
#endif

#endif /* _ASM_X86_MCE_H */