mce-severity.c 6.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12
/*
 * MCE grading rules.
 * Copyright 2008, 2009 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 *
 * Author: Andi Kleen
 */
#include <linux/kernel.h>
13 14 15
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/debugfs.h>
16 17 18 19 20 21 22 23 24
#include <asm/mce.h>

#include "mce-internal.h"

/*
 * Grade an mce by severity. In general the most severe ones are processed
 * first. Since there are quite a lot of combinations, test the bits in a
 * table-driven way. The rules are simply processed in order; the first
 * match wins.
 *
 * Note this is only used for machine check exceptions; the corrected
 * errors use much simpler rules. The exceptions still check for the corrected
 * errors, but only to leave them alone for the CMCI handler (except for
 * panic situations).
 */

A
Andi Kleen 已提交
32 33 34
/*
 * Context constraint of a severity rule. Values start at 1 so that a
 * zero-initialized rule field means "no constraint".
 */
enum context {
	IN_KERNEL = 1,
	IN_USER   = 2,
};

/*
 * mce_ser constraint of a severity rule. Values start at 1 so that a
 * zero-initialized rule field means "no constraint".
 */
enum ser {
	SER_REQUIRED = 1,
	NO_SER       = 2,
};

35 36 37 38 39 40
static struct severity {
	u64 mask;
	u64 result;
	unsigned char sev;
	unsigned char mcgmask;
	unsigned char mcgres;
A
Andi Kleen 已提交
41 42
	unsigned char ser;
	unsigned char context;
43
	unsigned char covered;
44 45
	char *msg;
} severities[] = {
46 47 48 49 50 51 52 53 54
#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
#define  KERNEL		.context = IN_KERNEL
#define  USER		.context = IN_USER
#define  SER		.ser = SER_REQUIRED
#define  NOSER		.ser = NO_SER
#define  BITCLR(x)	.mask = x, .result = 0
#define  BITSET(x)	.mask = x, .result = x
#define  MCGMASK(x, y)	.mcgmask = x, .mcgres = y
#define  MASK(x, y)	.mask = x, .result = y
A
Andi Kleen 已提交
55 56 57 58
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
#define MCACOD 0xffff

59 60 61
	MCESEV(
		NO, "Invalid",
		BITCLR(MCI_STATUS_VAL)
62
		),
63 64 65
	MCESEV(
		NO, "Not enabled",
		BITCLR(MCI_STATUS_EN)
66
		),
67 68 69
	MCESEV(
		PANIC, "Processor context corrupt",
		BITSET(MCI_STATUS_PCC)
70
		),
A
Andi Kleen 已提交
71
	/* When MCIP is not set something is very confused */
72 73 74
	MCESEV(
		PANIC, "MCIP not set in MCA handler",
		MCGMASK(MCG_STATUS_MCIP, 0)
75
		),
A
Andi Kleen 已提交
76
	/* Neither return not error IP -- no chance to recover -> PANIC */
77 78 79
	MCESEV(
		PANIC, "Neither restart nor error IP",
		MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
80
		),
81
	MCESEV(
82
		PANIC, "In kernel and no restart IP",
83
		KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
84
		),
85
	MCESEV(
86
		KEEP, "Corrected error",
87
		NOSER, BITCLR(MCI_STATUS_UC)
88
		),
A
Andi Kleen 已提交
89 90

	/* ignore OVER for UCNA */
91
	MCESEV(
92
		KEEP, "Uncorrected no action required",
93
		SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
94
		),
95
	MCESEV(
96
		PANIC, "Illegal combination (UCNA with AR=1)",
97 98
		SER,
		MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
99
		),
100
	MCESEV(
101
		KEEP, "Non signalled machine check",
102
		SER, MASK(MCI_STATUS_S, 0)
103
		),
A
Andi Kleen 已提交
104 105

	/* AR add known MCACODs here */
106
	MCESEV(
107
		PANIC, "Action required with lost events",
108 109
		SER,
		MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR)
110
		),
111
	MCESEV(
112
		PANIC, "Action required; unknown MCACOD",
113
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
114
		),
A
Andi Kleen 已提交
115 116

	/* known AO MCACODs: */
117
	MCESEV(
118
		AO, "Action optional: memory scrubbing error",
119
		SER, MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0)
120
		),
121
	MCESEV(
122
		AO, "Action optional: last level cache writeback error",
123
		SER, MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a)
124
		),
125
	MCESEV(
126
		SOME, "Action optional unknown MCACOD",
127
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
128
		),
129
	MCESEV(
130
		SOME, "Action optional with lost events",
131
		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER)
132
		),
133 134 135 136

	MCESEV(
		PANIC, "Overflowed uncorrected",
		BITSET(MCI_STATUS_UC|MCI_STATUS_OVER)
137
		),
138 139 140
	MCESEV(
		UC, "Uncorrected",
		BITSET(MCI_STATUS_UC)
141
		),
142 143 144
	MCESEV(
		SOME, "No match",
		BITSET(0)
145
		)	/* always matches. keep at end */
146 147
};

A
Andi Kleen 已提交
148 149 150 151 152 153 154 155 156 157 158 159
/*
 * Classify where the machine check hit.
 *
 * Only when EIPV is set does the saved IP point at the instruction that
 * caused the MCE; in that case a non-zero IP with the low two CS bits
 * equal to 3 is reported as user context. Everything else, including
 * the case where EIPV is clear, is treated as kernel context.
 */
static int error_context(struct mce *m)
{
	int from_user;

	/* Unknown, assume kernel */
	if (!(m->mcgstatus & MCG_STATUS_EIPV))
		return IN_KERNEL;

	from_user = m->ip && (m->cs & 3) == 3;
	if (from_user)
		return IN_USER;
	return IN_KERNEL;
}

160 161
int mce_severity(struct mce *a, int tolerant, char **msg)
{
A
Andi Kleen 已提交
162
	enum context ctx = error_context(a);
163
	struct severity *s;
A
Andi Kleen 已提交
164

165 166 167 168 169
	for (s = severities;; s++) {
		if ((a->status & s->mask) != s->result)
			continue;
		if ((a->mcgstatus & s->mcgmask) != s->mcgres)
			continue;
A
Andi Kleen 已提交
170 171 172 173 174 175
		if (s->ser == SER_REQUIRED && !mce_ser)
			continue;
		if (s->ser == NO_SER && mce_ser)
			continue;
		if (s->context && ctx != s->context)
			continue;
176 177
		if (msg)
			*msg = s->msg;
178
		s->covered = 1;
A
Andi Kleen 已提交
179 180 181 182
		if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
			if (panic_on_oops || tolerant < 1)
				return MCE_PANIC_SEVERITY;
		}
183 184 185
		return s->sev;
	}
}
186

187
#ifdef CONFIG_DEBUG_FS
188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
/*
 * seq_file ->start: return the table entry at index *pos, or NULL once
 * *pos is at or beyond the end of severities[].
 */
static void *s_start(struct seq_file *f, loff_t *pos)
{
	if (*pos < ARRAY_SIZE(severities))
		return &severities[*pos];
	return NULL;
}

/*
 * seq_file ->next: advance *pos by one and return the entry at the new
 * index, or NULL when the new index is past the end of severities[].
 */
static void *s_next(struct seq_file *f, void *data, loff_t *pos)
{
	loff_t idx = ++(*pos);

	if (idx < ARRAY_SIZE(severities))
		return &severities[idx];
	return NULL;
}

/* seq_file ->stop: intentionally empty; s_start/s_next acquire nothing. */
static void s_stop(struct seq_file *f, void *data)
{
}

static int s_show(struct seq_file *f, void *data)
{
	struct severity *ser = data;
	seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
	return 0;
}

/* Iterator callbacks that walk severities[] one entry per record. */
static const struct seq_operations severities_seq_ops = {
	.show	= s_show,	/* print one rule */
	.start	= s_start,	/* position at *pos */
	.next	= s_next,	/* step to the following rule */
	.stop	= s_stop,	/* nothing to undo */
};

/*
 * open() handler of the coverage file: attach the severities[] iterator
 * to the file and return seq_open()'s result.
 */
static int severities_coverage_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &severities_seq_ops);
}

/*
 * write() handler of the coverage file: any write clears the covered flag
 * of every rule. The written data and the file position are ignored and
 * the whole write is reported as consumed.
 */
static ssize_t severities_coverage_write(struct file *file,
					 const char __user *ubuf,
					 size_t count, loff_t *ppos)
{
	struct severity *rule;
	struct severity *end = severities + ARRAY_SIZE(severities);

	for (rule = severities; rule < end; rule++)
		rule->covered = 0;

	return count;
}

static const struct file_operations severities_coverage_fops = {
	.open		= severities_coverage_open,
	.release	= seq_release,
	.read		= seq_read,
	.write		= severities_coverage_write,
240
	.llseek		= seq_lseek,
241 242 243 244 245 246
};

/*
 * Create the "severities-coverage" file in the directory returned by
 * mce_get_debugfs_dir().
 *
 * Returns 0 on success, or -ENOMEM when either the directory or the file
 * could not be obtained.
 *
 * NOTE(review): the file is created with mode 0444 although its fops
 * provide a write handler; confirm whether that is intentional.
 */
static int __init severities_debugfs_init(void)
{
	struct dentry *dmce, *fseverities_coverage;

	dmce = mce_get_debugfs_dir();
	if (dmce == NULL)
		return -ENOMEM;

	fseverities_coverage = debugfs_create_file("severities-coverage",
						   0444, dmce, NULL,
						   &severities_coverage_fops);
	if (fseverities_coverage == NULL)
		return -ENOMEM;

	return 0;
}
late_initcall(severities_debugfs_init);
262
#endif