mce-severity.c 7.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12
/*
 * MCE grading rules.
 * Copyright 2008, 2009 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 *
 * Author: Andi Kleen
 */
#include <linux/kernel.h>
13 14 15
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/debugfs.h>
16 17 18 19 20 21 22 23 24
#include <asm/mce.h>

#include "mce-internal.h"

/*
 * Grade an mce by severity. In general the most severe ones are processed
 * first. Since there are quite a lot of combinations, test the bits in a
 * table-driven way. The rules are simply processed in order, first
 * match wins.
 *
 * Note this is only used for machine check exceptions, the corrected
 * errors use much simpler rules. The exceptions still check for the corrected
 * errors, but only to leave them alone for the CMCI handler (except for
 * panic situations)
 */

A
Andi Kleen 已提交
32 33
/*
 * Selector values used by the severity rules. Every enumerator is
 * non-zero on purpose: a rule field left at 0 means "don't care" and is
 * skipped by the matcher.
 */

/* Privilege level the machine check interrupted. */
enum context {
	IN_KERNEL = 1,
	IN_USER = 2,
};

/* Whether a rule applies only with, or only without, mca_cfg.ser set. */
enum ser {
	SER_REQUIRED = 1,
	NO_SER = 2,
};

/* Whether a rule applies only in, or only outside, exception context. */
enum exception {
	EXCP_CONTEXT = 1,
	NO_EXCP = 2,
};
A
Andi Kleen 已提交
35

36 37 38 39 40 41
static struct severity {
	u64 mask;
	u64 result;
	unsigned char sev;
	unsigned char mcgmask;
	unsigned char mcgres;
A
Andi Kleen 已提交
42 43
	unsigned char ser;
	unsigned char context;
44
	unsigned char excp;
45
	unsigned char covered;
46 47
	char *msg;
} severities[] = {
48 49 50 51 52
#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
#define  KERNEL		.context = IN_KERNEL
#define  USER		.context = IN_USER
#define  SER		.ser = SER_REQUIRED
#define  NOSER		.ser = NO_SER
53 54
#define  EXCP		.excp = EXCP_CONTEXT
#define  NOEXCP		.excp = NO_EXCP
55 56 57 58
#define  BITCLR(x)	.mask = x, .result = 0
#define  BITSET(x)	.mask = x, .result = x
#define  MCGMASK(x, y)	.mcgmask = x, .mcgres = y
#define  MASK(x, y)	.mask = x, .result = y
A
Andi Kleen 已提交
59 60
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
61
#define	MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
A
Andi Kleen 已提交
62

63 64 65
	MCESEV(
		NO, "Invalid",
		BITCLR(MCI_STATUS_VAL)
66
		),
67 68
	MCESEV(
		NO, "Not enabled",
69
		EXCP, BITCLR(MCI_STATUS_EN)
70
		),
71 72 73
	MCESEV(
		PANIC, "Processor context corrupt",
		BITSET(MCI_STATUS_PCC)
74
		),
A
Andi Kleen 已提交
75
	/* When MCIP is not set something is very confused */
76 77
	MCESEV(
		PANIC, "MCIP not set in MCA handler",
78
		EXCP, MCGMASK(MCG_STATUS_MCIP, 0)
79
		),
A
Andi Kleen 已提交
80
	/* Neither return not error IP -- no chance to recover -> PANIC */
81 82
	MCESEV(
		PANIC, "Neither restart nor error IP",
83
		EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
84
		),
85
	MCESEV(
86
		PANIC, "In kernel and no restart IP",
87 88 89 90 91
		EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
		),
	MCESEV(
		DEFERRED, "Deferred error",
		NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
92
		),
93
	MCESEV(
94
		KEEP, "Corrected error",
95
		NOSER, BITCLR(MCI_STATUS_UC)
96
		),
A
Andi Kleen 已提交
97 98

	/* ignore OVER for UCNA */
99
	MCESEV(
100
		UCNA, "Uncorrected no action required",
101
		SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
102
		),
103
	MCESEV(
104
		PANIC, "Illegal combination (UCNA with AR=1)",
105 106
		SER,
		MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
107
		),
108
	MCESEV(
109
		KEEP, "Non signalled machine check",
110
		SER, BITCLR(MCI_STATUS_S)
111
		),
A
Andi Kleen 已提交
112

113
	MCESEV(
114
		PANIC, "Action required with lost events",
115
		SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
116
		),
117 118 119 120

	/* known AR MCACODs: */
#ifdef	CONFIG_MEMORY_FAILURE
	MCESEV(
121
		KEEP, "Action required but unaffected thread is continuable",
122 123
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
		MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
124 125
		),
	MCESEV(
126
		AR, "Action required: data load error in a user process",
127
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
128 129
		USER
		),
130
	MCESEV(
131
		AR, "Action required: instruction fetch error in a user process",
132 133 134
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
		USER
		),
135
#endif
136
	MCESEV(
137
		PANIC, "Action required: unknown MCACOD",
138
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
139
		),
A
Andi Kleen 已提交
140 141

	/* known AO MCACODs: */
142
	MCESEV(
143
		AO, "Action optional: memory scrubbing error",
144
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
145
		),
146
	MCESEV(
147
		AO, "Action optional: last level cache writeback error",
148
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
149
		),
150
	MCESEV(
151
		SOME, "Action optional: unknown MCACOD",
152
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
153
		),
154
	MCESEV(
155
		SOME, "Action optional with lost events",
156
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
157
		),
158 159 160

	MCESEV(
		PANIC, "Overflowed uncorrected",
161
		BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
162
		),
163 164 165
	MCESEV(
		UC, "Uncorrected",
		BITSET(MCI_STATUS_UC)
166
		),
167 168 169
	MCESEV(
		SOME, "No match",
		BITSET(0)
170
		)	/* always matches. keep at end */
171 172
};

A
Andi Kleen 已提交
173
/*
174 175 176 177 178 179 180 181 182
 * If mcgstatus indicated that ip/cs on the stack were
 * no good, then "m->cs" will be zero and we will have
 * to assume the worst case (IN_KERNEL) as we actually
 * have no idea what we were executing when the machine
 * check hit.
 * If we do have a good "m->cs" (or a faked one in the
 * case we were executing in VM86 mode) we can use it to
 * distinguish an exception taken in user from from one
 * taken in the kernel.
A
Andi Kleen 已提交
183 184 185
 */
static int error_context(struct mce *m)
{
186
	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
A
Andi Kleen 已提交
187 188
}

189
/*
 * Grade a machine check against the severities[] table.
 *
 * @m:        the machine check record; its status, mcgstatus and cs
 *            fields are examined
 * @tolerant: tolerance level; a value below 1 escalates uncorrected
 *            errors taken in kernel context to a panic
 * @msg:      if non-NULL, receives the description string of the rule
 *            that matched
 * @is_excp:  true when called from exception context; selects which
 *            EXCP/NOEXCP rules are eligible
 *
 * Rules are tried in table order and the first one whose conditions all
 * hold wins. The loop has no explicit bound: it relies on the final table
 * entry matching unconditionally.
 *
 * Returns the MCE_*_SEVERITY of the matching rule, or MCE_PANIC_SEVERITY
 * when that severity is at least MCE_UC_SEVERITY, the error hit kernel
 * context and either panic_on_oops is set or @tolerant is below 1.
 */
int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
{
	enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
	enum context ctx = error_context(m);
	struct severity *s;

	for (s = severities;; s++) {
		/* Bank status and global status bit patterns must match. */
		if ((m->status & s->mask) != s->result)
			continue;
		if ((m->mcgstatus & s->mcgmask) != s->mcgres)
			continue;
		/* Optional selectors: a zero field in the rule is a wildcard. */
		if (s->ser == SER_REQUIRED && !mca_cfg.ser)
			continue;
		if (s->ser == NO_SER && mca_cfg.ser)
			continue;
		if (s->context && ctx != s->context)
			continue;
		if (s->excp && excp != s->excp)
			continue;
		if (msg)
			*msg = s->msg;
		/* Record the hit for the debugfs coverage report. */
		s->covered = 1;
		if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
			if (panic_on_oops || tolerant < 1)
				return MCE_PANIC_SEVERITY;
		}
		return s->sev;
	}
}
218

219
#ifdef CONFIG_DEBUG_FS
220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271
/*
 * seq_file ->start callback: hand back the rule at index *pos, or NULL
 * once *pos is at or beyond the end of severities[].
 */
static void *s_start(struct seq_file *f, loff_t *pos)
{
	return (*pos < ARRAY_SIZE(severities)) ? &severities[*pos] : NULL;
}

/*
 * seq_file ->next callback: advance *pos by one and hand back the rule at
 * the new index, or NULL when the new index is past the end of
 * severities[].
 */
static void *s_next(struct seq_file *f, void *data, loff_t *pos)
{
	loff_t idx = *pos + 1;

	*pos = idx;
	return (idx < ARRAY_SIZE(severities)) ? &severities[idx] : NULL;
}

/* seq_file ->stop callback: intentionally does nothing. */
static void s_stop(struct seq_file *f, void *data)
{
	/* s_start()/s_next() only return pointers into a static array. */
}

static int s_show(struct seq_file *f, void *data)
{
	struct severity *ser = data;
	seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
	return 0;
}

/* Iterator callbacks that walk severities[] one rule per record. */
static const struct seq_operations severities_seq_ops = {
	.start = s_start,
	.next = s_next,
	.stop = s_stop,
	.show = s_show,
};

/*
 * ->open handler of the coverage file: attach the severities[] iterator
 * to @file and pass seq_open()'s result straight back to the caller.
 */
static int severities_coverage_open(struct inode *inode, struct file *file)
{
	int ret = seq_open(file, &severities_seq_ops);

	return ret;
}

/*
 * ->write handler of the coverage file: any write, whatever its contents,
 * clears the covered flag of every rule. The written data is never read;
 * the full @count is reported as consumed.
 */
static ssize_t severities_coverage_write(struct file *file,
					 const char __user *ubuf,
					 size_t count, loff_t *ppos)
{
	struct severity *rule = severities;
	struct severity *end = severities + ARRAY_SIZE(severities);

	while (rule < end) {
		rule->covered = 0;
		rule++;
	}

	return count;
}

static const struct file_operations severities_coverage_fops = {
	.open		= severities_coverage_open,
	.release	= seq_release,
	.read		= seq_read,
	.write		= severities_coverage_write,
272
	.llseek		= seq_lseek,
273 274 275 276
};

/*
 * Create the "severities-coverage" file in the MCE debugfs directory.
 *
 * Returns 0 on success, or -ENOMEM when either the MCE debugfs directory
 * or the file itself could not be obtained.
 *
 * NOTE(review): the failure checks assume mce_get_debugfs_dir() and
 * debugfs_create_file() report failure by returning NULL -- confirm
 * against the debugfs API of the target kernel version.
 */
static int __init severities_debugfs_init(void)
{
	struct dentry *dmce, *fsev;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;

	fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
				   &severities_coverage_fops);
	if (!fsev)
		return -ENOMEM;

	return 0;
}
late_initcall(severities_debugfs_init);
294
#endif /* CONFIG_DEBUG_FS */