mce.h 11.0 KB
Newer Older
H
H. Peter Anvin 已提交
1 2
#ifndef _ASM_X86_MCE_H
#define _ASM_X86_MCE_H
3

4
#include <uapi/asm/mce.h>
5

6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Machine Check support for x86
 */

/*
 * MCG_CAP register defines.
 *
 * Capability flags are single-bit 64-bit masks; the two count fields are
 * extracted with MCG_BANKCNT_MASK and MCG_EXT_CNT().
 */
#define MCG_BANKCNT_MASK	0xff         /* Number of Banks */
#define MCG_CTL_P		(1ULL<<8)    /* MCG_CTL register available */
#define MCG_EXT_P		(1ULL<<9)    /* Extended registers available */
#define MCG_CMCI_P		(1ULL<<10)   /* CMCI supported */
#define MCG_EXT_CNT_MASK	0xff0000     /* Number of Extended registers */
#define MCG_EXT_CNT_SHIFT	16
/* Extract the extended-register count field from an MCG_CAP value c. */
#define MCG_EXT_CNT(c)		(((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
#define MCG_SER_P		(1ULL<<24)   /* MCA recovery/new status bits */
#define MCG_ELOG_P		(1ULL<<26)   /* Extended error log supported */
#define MCG_LMCE_P		(1ULL<<27)   /* Local machine check supported */

/* MCG_STATUS register defines (single-bit 64-bit masks) */
#define MCG_STATUS_RIPV  (1ULL<<0)   /* restart ip valid */
#define MCG_STATUS_EIPV  (1ULL<<1)   /* ip points to correct instruction */
#define MCG_STATUS_MCIP  (1ULL<<2)   /* machine check in progress */
#define MCG_STATUS_LMCES (1ULL<<3)   /* LMCE signaled */

/* MCG_EXT_CTL register defines */
#define MCG_EXT_CTL_LMCE_EN (1ULL<<0) /* Enable LMCE */

/* MCi_STATUS register defines (single-bit 64-bit masks) */
#define MCI_STATUS_VAL   (1ULL<<63)  /* valid error */
#define MCI_STATUS_OVER  (1ULL<<62)  /* previous errors lost */
#define MCI_STATUS_UC    (1ULL<<61)  /* uncorrected error */
#define MCI_STATUS_EN    (1ULL<<60)  /* error enabled */
#define MCI_STATUS_MISCV (1ULL<<59)  /* misc error reg. valid */
#define MCI_STATUS_ADDRV (1ULL<<58)  /* addr reg. valid */
#define MCI_STATUS_PCC   (1ULL<<57)  /* processor context corrupt */
#define MCI_STATUS_S	 (1ULL<<56)  /* Signaled machine check */
#define MCI_STATUS_AR	 (1ULL<<55)  /* Action required */

/*
 * AMD-specific bits.
 * Note: MCI_STATUS_TCC uses the same bit position (55) as MCI_STATUS_AR.
 */
#define MCI_STATUS_DEFERRED	(1ULL<<44)  /* declare an uncorrected error */
#define MCI_STATUS_POISON	(1ULL<<43)  /* access poisonous data */
#define MCI_STATUS_TCC		(1ULL<<55)  /* Task context corrupt */

/*
 * When the McaX field is set, the bank implements the MCA extensions:
 *  - the deferred error interrupt type can be specified per bank;
 *  - MCx_MISC0[BlkPtr] reports that extended MISC registers exist, but it
 *    must not be used to derive MSR numbers;
 *  - MCx_STATUS carries the TCC bit.
 */
#define MCI_CONFIG_MCAX		0x1
#define MCI_IPID_MCATYPE	0xffff0000
#define MCI_IPID_HWID		0xfff
57

58 59 60 61 62 63 64 65 66
/*
 * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
 * bits 15:0.  But bit 12 is the 'F' bit, defined for corrected
 * errors to indicate that errors are being filtered by hardware.
 * We should mask out bit 12 when looking for specific signatures
 * of uncorrected errors - so the F bit is deliberately skipped
 * in this #define.
 */
#define MCACOD		  0xefff     /* MCA Error Code */

/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
#define MCACOD_SCRUB	0x00C0	/* 0xC0-0xCF Memory Scrubbing */
#define MCACOD_SCRUBMSK	0xeff0	/* Skip bit 12 ('F' bit) */
#define MCACOD_L3WB	0x017A	/* L3 Explicit Writeback */
#define MCACOD_DATA	0x0134	/* Data Load */
#define MCACOD_INSTR	0x0150	/* Instruction Fetch */

/* MCi_MISC register defines */
#define MCI_MISC_ADDR_LSB(m)	((m) & 0x3f)		/* bits 5:0 of m */
#define MCI_MISC_ADDR_MODE(m)	(((m) >> 6) & 7)	/* bits 8:6 of m */
/* Values returned by MCI_MISC_ADDR_MODE() */
#define  MCI_MISC_ADDR_SEGOFF	0	/* segment offset */
#define  MCI_MISC_ADDR_LINEAR	1	/* linear address */
#define  MCI_MISC_ADDR_PHYS	2	/* physical address */
#define  MCI_MISC_ADDR_MEM	3	/* memory address */
#define  MCI_MISC_ADDR_GENERIC	7	/* generic */

/* CTL2 register defines */
#define MCI_CTL2_CMCI_EN		(1ULL << 30)
#define MCI_CTL2_CMCI_THRESHOLD_MASK	0x7fffULL

/*
 * Injection flags: the low two bits (MCJ_CTX_MASK) select the context,
 * the remaining values are independent flag bits.
 */
#define MCJ_CTX_MASK		3
#define MCJ_CTX(flags)		((flags) & MCJ_CTX_MASK)
#define MCJ_CTX_RANDOM		0    /* inject context: random */
#define MCJ_CTX_PROCESS		0x1  /* inject context: process */
#define MCJ_CTX_IRQ		0x2  /* inject context: IRQ */
#define MCJ_NMI_BROADCAST	0x4  /* do NMI broadcasting */
#define MCJ_EXCEPTION		0x8  /* raise as exception */
#define MCJ_IRQ_BROADCAST	0x10 /* do IRQ broadcasting */

/* Bit 0 in flags means overflow. */
#define MCE_OVERFLOW		0

/* Bank numbers defined by software */
#define MCE_EXTENDED_BANK	128
#define MCE_THERMAL_BANK	(MCE_EXTENDED_BANK + 0)

/* Log capacity (number of entries) and the signature string stored in it */
#define MCE_LOG_LEN		32
#define MCE_LOG_SIGNATURE	"MACHINECHECK"

106
/*
 * AMD Scalable MCA
 *
 * The MC0 values are the MSR numbers for bank 0; the MCx() helpers add
 * 0x10 per bank to reach bank x.
 */
#define MSR_AMD64_SMCA_MC0_MISC0	0xc0002003
#define MSR_AMD64_SMCA_MC0_CONFIG	0xc0002004
#define MSR_AMD64_SMCA_MC0_IPID		0xc0002005
#define MSR_AMD64_SMCA_MC0_MISC1	0xc000200a
#define MSR_AMD64_SMCA_MCx_MISC(x)	(MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_CONFIG(x)	(MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_IPID(x)	(MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
/*
 * MSR at offset y from MISC1 in bank x.  Both arguments are parenthesized
 * so that expression arguments (e.g. "i << 1" or "a ? b : c") are evaluated
 * before being added to the base.
 */
#define MSR_AMD64_SMCA_MCx_MISCy(x, y)	((MSR_AMD64_SMCA_MC0_MISC1 + (y)) + (0x10*(x)))

116 117 118 119 120 121 122 123 124 125 126 127 128 129
/*
 * Holds all data belonging to the MCE log.  The leading signature makes
 * the structure easier to locate from external debugging tools.  An
 * entry is only valid once its finished flag is set.
 */
struct mce_log {
	char		signature[12];		/* "MACHINECHECK" */
	unsigned int	len;			/* = MCE_LOG_LEN */
	unsigned int	next;
	unsigned int	flags;
	unsigned int	recordlen;		/* length of struct mce */
	struct mce	entry[MCE_LOG_LEN];
};
130 131 132

struct mca_config {
	bool dont_log_ce;
133
	bool cmci_disabled;
134
	bool lmce_disabled;
135
	bool ignore_ce;
136 137 138
	bool disabled;
	bool ser;
	bool bios_cmci_threshold;
139
	u8 banks;
140
	s8 bootlog;
141
	int tolerant;
142
	int monarch_timeout;
143
	int panic_timeout;
144
	u32 rip_msr;
145 146
};

147
struct mce_vendor_flags {
148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
	/*
	 * Indicates that overflow conditions are not fatal, when set.
	 */
	__u64 overflow_recov	: 1,

	/*
	 * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and
	 * Recovery. It indicates support for data poisoning in HW and deferred
	 * error interrupts.
	 */
	      succor		: 1,

	/*
	 * (AMD) SMCA: This bit indicates support for Scalable MCA which expands
	 * the register space for each MCA bank and also increases number of
	 * banks. Also, to accommodate the new banks and registers, the MCA
	 * register space is moved to a new MSR range.
	 */
	      smca		: 1,

	      __reserved_0	: 61;
169 170 171
};
/* Global vendor feature flags and machine check configuration. */
extern struct mce_vendor_flags mce_flags;
extern struct mca_config mca_cfg;

/* Add or remove a notifier block on the MCE decode chain. */
extern void mce_register_decode_chain(struct notifier_block *nb);
extern void mce_unregister_decode_chain(struct notifier_block *nb);

H
Hidetoshi Seto 已提交
176
#include <linux/percpu.h>
A
Arun Sharma 已提交
177
#include <linux/atomic.h>
H
Hidetoshi Seto 已提交
178

179
extern int mce_p5_enabled;

/*
 * Machine check initialization entry points.  When CONFIG_X86_MCE is not
 * set they become empty inline stubs, and mcheck_init() returns 0.
 */
#ifdef CONFIG_X86_MCE
int mcheck_init(void);
void mcheck_cpu_init(struct cpuinfo_x86 *c);
void mcheck_cpu_clear(struct cpuinfo_x86 *c);
void mcheck_vendor_init_severity(void);
#else
static inline int mcheck_init(void) { return 0; }
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {}
static inline void mcheck_vendor_init_severity(void) {}
#endif

H
Hidetoshi Seto 已提交
193 194 195
/*
 * P5 and WinChip machine check support.  enable_p5_mce() sets
 * mce_p5_enabled to 1; without CONFIG_X86_ANCIENT_MCE all three helpers
 * are empty inline stubs.
 */
#ifdef CONFIG_X86_ANCIENT_MCE
void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
void winchip_mcheck_init(struct cpuinfo_x86 *c);
static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
#else
static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
static inline void enable_p5_mce(void) {}
#endif

203
void mce_setup(struct mce *m);
204
void mce_log(struct mce *m);
205
DECLARE_PER_CPU(struct device *, mce_device);
206

A
Andi Kleen 已提交
207
/*
 * Maximum banks number.
 * This is the limit of the current register layout on
 * Intel CPUs.
 */
#define MAX_NR_BANKS 32

214 215
/*
 * Intel-specific MCE feature setup and CMCI control.  Without
 * CONFIG_X86_MCE_INTEL every entry point is an empty inline stub.
 */
#ifdef CONFIG_X86_MCE_INTEL
void mce_intel_feature_init(struct cpuinfo_x86 *c);
void mce_intel_feature_clear(struct cpuinfo_x86 *c);
void cmci_clear(void);
void cmci_reenable(void);
void cmci_rediscover(void);
void cmci_recheck(void);
#else
static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
static inline void mce_intel_feature_clear(struct cpuinfo_x86 *c) { }
static inline void cmci_clear(void) {}
static inline void cmci_reenable(void) {}
static inline void cmci_rediscover(void) {}
static inline void cmci_recheck(void) {}
#endif

/* AMD-specific MCE feature setup; an empty stub unless CONFIG_X86_MCE_AMD is set. */
#ifndef CONFIG_X86_MCE_AMD
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
}
#else
void mce_amd_feature_init(struct cpuinfo_x86 *c);
#endif

236
int mce_available(struct cpuinfo_x86 *c);
A
Andi Kleen 已提交
237

238
DECLARE_PER_CPU(unsigned, mce_exception_count);
239
DECLARE_PER_CPU(unsigned, mce_poll_count);
240

241 242 243
typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);

244
enum mcp_flags {
245 246 247
	MCP_TIMESTAMP	= BIT(0),	/* log time stamp */
	MCP_UC		= BIT(1),	/* log uncorrected errors */
	MCP_DONTLOG	= BIT(2),	/* only clear, don't log */
248
};
249
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
250

251
int mce_notify_irq(void);
252

253
DECLARE_PER_CPU(struct mce, injectm);
254 255 256 257

extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
				    const char __user *ubuf,
				    size_t usize, loff_t *off));
258

259 260 261
/* Disable CMCI/polling for MCA bank claimed by firmware */
extern void mce_disable_bank(int bank);

H
Hidetoshi Seto 已提交
262 263 264 265 266 267 268 269 270 271 272
/*
 * Exception handler
 */

/* Pointer to the machine check handler installed for this CPU setup. */
extern void (*machine_check_vector)(struct pt_regs *, long error_code);
void do_machine_check(struct pt_regs *regs, long error_code);

/*
 * Threshold handler
 */

extern void (*mce_threshold_vector)(void);
extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

/* Deferred error interrupt handler */
extern void (*deferred_error_int_vector)(void);

280 281 282 283 284 285 286
/*
 * Thermal handler
 */

void intel_init_thermal(struct cpuinfo_x86 *c);

void mce_log_therm_throt_event(__u64 status);

/* Interrupt Handler for core thermal thresholds */
extern int (*platform_thermal_notify)(__u64 msr_val);

/* Interrupt Handler for package thermal thresholds */
extern int (*platform_thermal_package_notify)(__u64 msr_val);

/*
 * Callback support of rate control: returns true if the callback has
 * rate control.
 */
extern bool (*platform_thermal_package_rate_control)(void);

298 299 300 301 302 303
/* Thermal init hook; an empty stub unless CONFIG_X86_THERMAL_VECTOR is set. */
#ifndef CONFIG_X86_THERMAL_VECTOR
static inline void mcheck_intel_therm_init(void)
{
}
#else
extern void mcheck_intel_therm_init(void);
#endif

304 305 306 307 308 309 310 311
/*
 * APEI uses this to report a memory error via /dev/mcelog.
 */

struct cper_sec_mem_err;

void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err);

312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356
/*
 * Enumerate new IP types and HWID values in AMD processors which support
 * Scalable MCA.
 */
#ifdef CONFIG_X86_MCE_AMD
/* IP types on Scalable MCA processors; N_AMD_IP_TYPES is the count and stays last. */
enum amd_ip_types {
	SMCA_F17H_CORE	= 0,	/* Core errors */
	SMCA_DF		= 1,	/* Data Fabric */
	SMCA_UMC	= 2,	/* Unified Memory Controller */
	SMCA_PB		= 3,	/* Parameter Block */
	SMCA_PSP	= 4,	/* Platform Security Processor */
	SMCA_SMU	= 5,	/* System Management Unit */
	N_AMD_IP_TYPES
};

/* A name together with a HWID value. */
struct amd_hwid {
	const char	*name;
	unsigned int	hwid;
};

/* One slot per enum amd_ip_types value. */
extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES];

/* Core MCA blocks; N_CORE_MCA_BLOCKS is the count and stays last. */
enum amd_core_mca_blocks {
	SMCA_LS		= 0,	/* Load Store */
	SMCA_IF		= 1,	/* Instruction Fetch */
	SMCA_L2_CACHE	= 2,	/* L2 cache */
	SMCA_DE		= 3,	/* Decoder unit */
	RES		= 4,	/* Reserved */
	SMCA_EX		= 5,	/* Execution unit */
	SMCA_FP		= 6,	/* Floating Point */
	SMCA_L3_CACHE	= 7,	/* L3 cache */
	N_CORE_MCA_BLOCKS
};

/* One name per enum amd_core_mca_blocks value. */
extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS];

/* Data Fabric MCA blocks; N_DF_BLOCKS is the count and stays last. */
enum amd_df_mca_blocks {
	SMCA_CS		= 0,	/* Coherent Slave */
	SMCA_PIE	= 1,	/* Power management, Interrupts, etc */
	N_DF_BLOCKS
};

/* One name per enum amd_df_mca_blocks value. */
extern const char * const amd_df_mcablock_names[N_DF_BLOCKS];
#endif

H
H. Peter Anvin 已提交
357
#endif /* _ASM_X86_MCE_H */