mce-severity.c 6.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12
/*
 * MCE grading rules.
 * Copyright 2008, 2009 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 *
 * Author: Andi Kleen
 */
#include <linux/kernel.h>
13 14 15
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/debugfs.h>
16 17 18 19 20 21 22 23 24
#include <asm/mce.h>

#include "mce-internal.h"

/*
 * Grade an mce by severity. In general the most severe ones are processed
 * first. Since there are quite a lot of combinations, test the bits in a
 * table-driven way. The rules are simply processed in order; the first
 * match wins.
 *
 * Note this is only used for machine check exceptions; the corrected
 * errors use much simpler rules. The exceptions still check for the corrected
 * errors, but only to leave them alone for the CMCI handler (except for
 * panic situations).
 */

A
Andi Kleen 已提交
32 33 34
/*
 * Execution context an error was raised in. Values start at 1 so that a
 * zero .context in a rule means "matches any context".
 */
enum context {
	IN_KERNEL = 1,
	IN_USER   = 2
};
/*
 * Whether a rule applies only with mce_ser set (SER_REQUIRED) or only with
 * it clear (NO_SER). Values start at 1 so that a zero .ser in a rule means
 * the rule applies either way.
 */
enum ser {
	SER_REQUIRED = 1,
	NO_SER       = 2
};

35 36 37 38 39 40
static struct severity {
	u64 mask;
	u64 result;
	unsigned char sev;
	unsigned char mcgmask;
	unsigned char mcgres;
A
Andi Kleen 已提交
41 42
	unsigned char ser;
	unsigned char context;
43
	unsigned char covered;
44 45
	char *msg;
} severities[] = {
46 47 48 49 50 51 52 53 54
#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
#define  KERNEL		.context = IN_KERNEL
#define  USER		.context = IN_USER
#define  SER		.ser = SER_REQUIRED
#define  NOSER		.ser = NO_SER
#define  BITCLR(x)	.mask = x, .result = 0
#define  BITSET(x)	.mask = x, .result = x
#define  MCGMASK(x, y)	.mcgmask = x, .mcgres = y
#define  MASK(x, y)	.mask = x, .result = y
A
Andi Kleen 已提交
55 56
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
57
#define	MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
A
Andi Kleen 已提交
58
#define MCACOD 0xffff
59 60 61 62 63 64
/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
#define MCACOD_SCRUB	0x00C0	/* 0xC0-0xCF Memory Scrubbing */
#define MCACOD_SCRUBMSK	0xfff0
#define MCACOD_L3WB	0x017A	/* L3 Explicit Writeback */
#define MCACOD_DATA	0x0134	/* Data Load */
#define MCACOD_INSTR	0x0150	/* Instruction Fetch */
A
Andi Kleen 已提交
65

66 67 68
	MCESEV(
		NO, "Invalid",
		BITCLR(MCI_STATUS_VAL)
69
		),
70 71 72
	MCESEV(
		NO, "Not enabled",
		BITCLR(MCI_STATUS_EN)
73
		),
74 75 76
	MCESEV(
		PANIC, "Processor context corrupt",
		BITSET(MCI_STATUS_PCC)
77
		),
A
Andi Kleen 已提交
78
	/* When MCIP is not set something is very confused */
79 80 81
	MCESEV(
		PANIC, "MCIP not set in MCA handler",
		MCGMASK(MCG_STATUS_MCIP, 0)
82
		),
A
Andi Kleen 已提交
83
	/* Neither return not error IP -- no chance to recover -> PANIC */
84 85 86
	MCESEV(
		PANIC, "Neither restart nor error IP",
		MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
87
		),
88
	MCESEV(
89
		PANIC, "In kernel and no restart IP",
90
		KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
91
		),
92
	MCESEV(
93
		KEEP, "Corrected error",
94
		NOSER, BITCLR(MCI_STATUS_UC)
95
		),
A
Andi Kleen 已提交
96 97

	/* ignore OVER for UCNA */
98
	MCESEV(
99
		KEEP, "Uncorrected no action required",
100
		SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
101
		),
102
	MCESEV(
103
		PANIC, "Illegal combination (UCNA with AR=1)",
104 105
		SER,
		MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
106
		),
107
	MCESEV(
108
		KEEP, "Non signalled machine check",
109
		SER, BITCLR(MCI_STATUS_S)
110
		),
A
Andi Kleen 已提交
111

112
	MCESEV(
113
		PANIC, "Action required with lost events",
114
		SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
115
		),
116 117 118 119 120

	/* known AR MCACODs: */
#ifdef	CONFIG_MEMORY_FAILURE
	MCESEV(
		KEEP, "HT thread notices Action required: data load error",
121
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
122 123 124 125
		MCGMASK(MCG_STATUS_EIPV, 0)
		),
	MCESEV(
		AR, "Action required: data load error",
126
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
127 128 129
		USER
		),
#endif
130
	MCESEV(
131
		PANIC, "Action required: unknown MCACOD",
132
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
133
		),
A
Andi Kleen 已提交
134 135

	/* known AO MCACODs: */
136
	MCESEV(
137
		AO, "Action optional: memory scrubbing error",
138
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
139
		),
140
	MCESEV(
141
		AO, "Action optional: last level cache writeback error",
142
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
143
		),
144
	MCESEV(
145
		SOME, "Action optional: unknown MCACOD",
146
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
147
		),
148
	MCESEV(
149
		SOME, "Action optional with lost events",
150
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
151
		),
152 153 154

	MCESEV(
		PANIC, "Overflowed uncorrected",
155
		BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
156
		),
157 158 159
	MCESEV(
		UC, "Uncorrected",
		BITSET(MCI_STATUS_UC)
160
		),
161 162 163
	MCESEV(
		SOME, "No match",
		BITSET(0)
164
		)	/* always matches. keep at end */
165 166
};

A
Andi Kleen 已提交
167 168 169 170 171 172 173 174 175 176 177 178
/*
 * Classify where the error was raised.
 *
 * When the EIPV bit is set, the saved IP is the instruction which caused
 * the MCE, so the saved CS privilege bits tell user from kernel. Without
 * EIPV the origin is unknown and kernel context is assumed.
 *
 * Returns IN_USER or IN_KERNEL.
 */
static int error_context(struct mce *m)
{
	/* Unknown, assume kernel */
	if (!(m->mcgstatus & MCG_STATUS_EIPV))
		return IN_KERNEL;

	/* A non-zero IP with the low two CS bits both set means user mode */
	if (m->ip && (m->cs & 3) == 3)
		return IN_USER;

	return IN_KERNEL;
}

179
int mce_severity(struct mce *m, int tolerant, char **msg)
180
{
181
	enum context ctx = error_context(m);
182
	struct severity *s;
A
Andi Kleen 已提交
183

184
	for (s = severities;; s++) {
185
		if ((m->status & s->mask) != s->result)
186
			continue;
187
		if ((m->mcgstatus & s->mcgmask) != s->mcgres)
188
			continue;
A
Andi Kleen 已提交
189 190 191 192 193 194
		if (s->ser == SER_REQUIRED && !mce_ser)
			continue;
		if (s->ser == NO_SER && mce_ser)
			continue;
		if (s->context && ctx != s->context)
			continue;
195 196
		if (msg)
			*msg = s->msg;
197
		s->covered = 1;
A
Andi Kleen 已提交
198 199 200 201
		if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
			if (panic_on_oops || tolerant < 1)
				return MCE_PANIC_SEVERITY;
		}
202 203 204
		return s->sev;
	}
}
205

206
#ifdef CONFIG_DEBUG_FS
207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
/*
 * seq_file start callback: return the rule at index *pos, or NULL once
 * *pos is at or past the end of the severities table.
 */
static void *s_start(struct seq_file *f, loff_t *pos)
{
	return (*pos < ARRAY_SIZE(severities)) ? &severities[*pos] : NULL;
}

/*
 * seq_file next callback: advance *pos by one and return the rule at the
 * new index, or NULL when the new index is past the end of the table.
 */
static void *s_next(struct seq_file *f, void *data, loff_t *pos)
{
	*pos += 1;

	if (*pos < ARRAY_SIZE(severities))
		return &severities[*pos];

	return NULL;
}

/* seq_file stop callback: intentionally empty, there is nothing to do here. */
static void s_stop(struct seq_file *f, void *data)
{
}

static int s_show(struct seq_file *f, void *data)
{
	struct severity *ser = data;
	seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
	return 0;
}

/* Iterator callbacks used to list the severities table through seq_file. */
static const struct seq_operations severities_seq_ops = {
	/* iteration */
	.start = s_start,
	.next  = s_next,
	.stop  = s_stop,
	/* formatting of a single entry */
	.show  = s_show,
};

/*
 * open handler for the coverage file: attach the severities iterator to
 * @file. Returns whatever seq_open() returns.
 */
static int severities_coverage_open(struct inode *inode, struct file *file)
{
	int rc;

	rc = seq_open(file, &severities_seq_ops);

	return rc;
}

/*
 * write handler for the coverage file: any write clears the covered flag
 * of every rule. The written data is not looked at; the whole write is
 * reported as consumed by returning @count.
 */
static ssize_t severities_coverage_write(struct file *file,
					 const char __user *ubuf,
					 size_t count, loff_t *ppos)
{
	struct severity *rule;
	struct severity *end = severities + ARRAY_SIZE(severities);

	for (rule = severities; rule < end; rule++)
		rule->covered = 0;

	return count;
}

static const struct file_operations severities_coverage_fops = {
	.open		= severities_coverage_open,
	.release	= seq_release,
	.read		= seq_read,
	.write		= severities_coverage_write,
259
	.llseek		= seq_lseek,
260 261 262 263
};

/*
 * Create the "severities-coverage" file in the directory returned by
 * mce_get_debugfs_dir().
 *
 * Returns 0 on success, or -ENOMEM when either the directory or the file
 * could not be obtained.
 */
static int __init severities_debugfs_init(void)
{
	struct dentry *parent;

	parent = mce_get_debugfs_dir();
	if (!parent)
		return -ENOMEM;

	if (!debugfs_create_file("severities-coverage", 0444, parent, NULL,
				 &severities_coverage_fops))
		return -ENOMEM;

	return 0;
}
late_initcall(severities_debugfs_init);
281
#endif /* CONFIG_DEBUG_FS */