提交 b4e8f0b6 编写于 作者: M Mauro Carvalho Chehab

i7core_edac: Use Device 3 function 2 to report errors with RDIMM's

Nehalem and upper chipsets provide an special device that has corrected memory
error counters detected with registered dimms. This device is only seen if
there are registered memories plugged.

After this patch, on a machine fully equiped with RDIMM's, it will use the
Device 3 function 2 to count corrected errors instead on relying at mcelog.

For unregistered DIMMs, it will keep the old behavior, counting errors
via mcelog.

This patch were developed together with Keith Mannthey <kmannth@us.ibm.com>
Signed-off-by: NKeith Mannthey <kmannth@us.ibm.com>
Signed-off-by: NMauro Carvalho Chehab <mchehab@redhat.com>
上级 61053fde
...@@ -73,6 +73,18 @@ ...@@ -73,6 +73,18 @@
#define DIMM1_COR_ERR(r) (((r) >> 16) & 0x7fff) #define DIMM1_COR_ERR(r) (((r) >> 16) & 0x7fff)
#define DIMM0_COR_ERR(r) ((r) & 0x7fff) #define DIMM0_COR_ERR(r) ((r) & 0x7fff)
/* OFFSETS for Device 3 Function 2, as inicated on Xeon 5500 datasheet */
#define MC_COR_ECC_CNT_0 0x80
#define MC_COR_ECC_CNT_1 0x84
#define MC_COR_ECC_CNT_2 0x88
#define MC_COR_ECC_CNT_3 0x8c
#define MC_COR_ECC_CNT_4 0x90
#define MC_COR_ECC_CNT_5 0x94
#define DIMM_TOP_COR_ERR(r) (((r) >> 16) & 0x7fff)
#define DIMM_BOT_COR_ERR(r) ((r) & 0x7fff)
/* OFFSETS for Devices 4,5 and 6 Function 0 */ /* OFFSETS for Devices 4,5 and 6 Function 0 */
#define MC_CHANNEL_DIMM_INIT_PARAMS 0x58 #define MC_CHANNEL_DIMM_INIT_PARAMS 0x58
...@@ -194,13 +206,20 @@ struct i7core_pvt { ...@@ -194,13 +206,20 @@ struct i7core_pvt {
struct i7core_inject inject; struct i7core_inject inject;
struct i7core_channel channel[NUM_SOCKETS][NUM_CHANS]; struct i7core_channel channel[NUM_SOCKETS][NUM_CHANS];
unsigned int is_registered:1; /* true if all memories are RDIMMs */
int sockets; /* Number of sockets */ int sockets; /* Number of sockets */
int channels; /* Number of active channels */ int channels; /* Number of active channels */
int ce_count_available[NUM_SOCKETS]; int ce_count_available[NUM_SOCKETS];
/* ECC corrected errors counts per dimm */ int csrow_map[NUM_SOCKETS][NUM_CHANS][MAX_DIMMS];
unsigned long ce_count[NUM_SOCKETS][MAX_DIMMS];
int last_ce_count[NUM_SOCKETS][MAX_DIMMS]; /* ECC corrected errors counts per udimm */
unsigned long udimm_ce_count[NUM_SOCKETS][MAX_DIMMS];
int udimm_last_ce_count[NUM_SOCKETS][MAX_DIMMS];
/* ECC corrected errors counts per rdimm */
unsigned long rdimm_ce_count[NUM_SOCKETS][NUM_CHANS][MAX_DIMMS];
int rdimm_last_ce_count[NUM_SOCKETS][NUM_CHANS][MAX_DIMMS];
/* mcelog glue */ /* mcelog glue */
struct edac_mce edac_mce; struct edac_mce edac_mce;
...@@ -471,6 +490,8 @@ static int get_dimm_config(struct mem_ctl_info *mci, int *csrow, u8 socket) ...@@ -471,6 +490,8 @@ static int get_dimm_config(struct mem_ctl_info *mci, int *csrow, u8 socket)
numrow(pvt->info.max_dod >> 6), numrow(pvt->info.max_dod >> 6),
numcol(pvt->info.max_dod >> 9)); numcol(pvt->info.max_dod >> 9));
pvt->is_registered = 1;
for (i = 0; i < NUM_CHANS; i++) { for (i = 0; i < NUM_CHANS; i++) {
u32 data, dimm_dod[3], value[8]; u32 data, dimm_dod[3], value[8];
...@@ -492,8 +513,14 @@ static int get_dimm_config(struct mem_ctl_info *mci, int *csrow, u8 socket) ...@@ -492,8 +513,14 @@ static int get_dimm_config(struct mem_ctl_info *mci, int *csrow, u8 socket)
if (data & REGISTERED_DIMM) if (data & REGISTERED_DIMM)
mtype = MEM_RDDR3; mtype = MEM_RDDR3;
else else {
mtype = MEM_DDR3; mtype = MEM_DDR3;
/*
* FIXME: Currently, the driver will use dev 3:2
* counter registers only if all memories are registered
*/
pvt->is_registered = 0;
}
#if 0 #if 0
if (data & THREE_DIMMS_PRESENT) if (data & THREE_DIMMS_PRESENT)
pvt->channel[i].dimms = 3; pvt->channel[i].dimms = 3;
...@@ -562,6 +589,8 @@ static int get_dimm_config(struct mem_ctl_info *mci, int *csrow, u8 socket) ...@@ -562,6 +589,8 @@ static int get_dimm_config(struct mem_ctl_info *mci, int *csrow, u8 socket)
csr->channels[0].chan_idx = i; csr->channels[0].chan_idx = i;
csr->channels[0].ce_count = 0; csr->channels[0].ce_count = 0;
pvt->csrow_map[socket][i][j] = *csrow;
switch (banks) { switch (banks) {
case 4: case 4:
csr->dtype = DEV_X4; csr->dtype = DEV_X4;
...@@ -1031,19 +1060,31 @@ static ssize_t i7core_inject_enable_show(struct mem_ctl_info *mci, ...@@ -1031,19 +1060,31 @@ static ssize_t i7core_inject_enable_show(struct mem_ctl_info *mci,
static ssize_t i7core_ce_regs_show(struct mem_ctl_info *mci, char *data) static ssize_t i7core_ce_regs_show(struct mem_ctl_info *mci, char *data)
{ {
unsigned i, count, total = 0; unsigned i, j, count, total = 0;
struct i7core_pvt *pvt = mci->pvt_info; struct i7core_pvt *pvt = mci->pvt_info;
for (i = 0; i < pvt->sockets; i++) { for (i = 0; i < pvt->sockets; i++) {
if (!pvt->ce_count_available[i]) if (!pvt->ce_count_available[i]) {
count = sprintf(data, "socket 0 data unavailable\n"); count = sprintf(data, "socket 0 data unavailable\n");
else continue;
}
if (!pvt->is_registered)
count = sprintf(data, "socket %d, dimm0: %lu\n" count = sprintf(data, "socket %d, dimm0: %lu\n"
"dimm1: %lu\ndimm2: %lu\n", "dimm1: %lu\ndimm2: %lu\n",
i, i,
pvt->ce_count[i][0], pvt->udimm_ce_count[i][0],
pvt->ce_count[i][1], pvt->udimm_ce_count[i][1],
pvt->ce_count[i][2]); pvt->udimm_ce_count[i][2]);
else
for (j = 0; j < NUM_CHANS; j++) {
count = sprintf(data, "socket %d, channel %d"
"dimm0: %lu\n"
"dimm1: %lu\ndimm2: %lu\n",
i, j,
pvt->rdimm_ce_count[i][j][0],
pvt->rdimm_ce_count[i][j][1],
pvt->rdimm_ce_count[i][j][2]);
}
data += count; data += count;
total += count; total += count;
} }
...@@ -1308,6 +1349,103 @@ static int mci_bind_devs(struct mem_ctl_info *mci) ...@@ -1308,6 +1349,103 @@ static int mci_bind_devs(struct mem_ctl_info *mci)
/**************************************************************************** /****************************************************************************
Error check routines Error check routines
****************************************************************************/ ****************************************************************************/
static void i7core_rdimm_update_csrow(struct mem_ctl_info *mci, int socket,
int chan, int dimm, int add)
{
char *msg;
struct i7core_pvt *pvt = mci->pvt_info;
int row = pvt->csrow_map[socket][chan][dimm], i;
for (i = 0; i < add; i++) {
msg = kasprintf(GFP_KERNEL, "Corrected error "
"(Socket=%d channel=%d dimm=%d",
socket, chan, dimm);
edac_mc_handle_fbd_ce(mci, row, 0, msg);
kfree (msg);
}
}
static void i7core_rdimm_update_ce_count(struct mem_ctl_info *mci,
int socket, int chan, int new0, int new1, int new2)
{
struct i7core_pvt *pvt = mci->pvt_info;
int add0 = 0, add1 = 0, add2 = 0;
/* Updates CE counters if it is not the first time here */
if (pvt->ce_count_available[socket]) {
/* Updates CE counters */
add2 = new2 - pvt->rdimm_last_ce_count[socket][chan][2];
add1 = new1 - pvt->rdimm_last_ce_count[socket][chan][1];
add0 = new0 - pvt->rdimm_last_ce_count[socket][chan][0];
if (add2 < 0)
add2 += 0x7fff;
pvt->rdimm_ce_count[socket][chan][2] += add2;
if (add1 < 0)
add1 += 0x7fff;
pvt->rdimm_ce_count[socket][chan][1] += add1;
if (add0 < 0)
add0 += 0x7fff;
pvt->rdimm_ce_count[socket][chan][0] += add0;
} else
pvt->ce_count_available[socket] = 1;
/* Store the new values */
pvt->rdimm_last_ce_count[socket][chan][2] = new2;
pvt->rdimm_last_ce_count[socket][chan][1] = new1;
pvt->rdimm_last_ce_count[socket][chan][0] = new0;
/*updated the edac core */
if (add0 != 0)
i7core_rdimm_update_csrow(mci, socket, chan, 0, add0);
if (add1 != 0)
i7core_rdimm_update_csrow(mci, socket, chan, 1, add1);
if (add2 != 0)
i7core_rdimm_update_csrow(mci, socket, chan, 2, add2);
}
static void i7core_rdimm_check_mc_ecc_err(struct mem_ctl_info *mci, u8 socket)
{
struct i7core_pvt *pvt = mci->pvt_info;
u32 rcv[3][2];
int i, new0, new1, new2;
/*Read DEV 3: FUN 2: MC_COR_ECC_CNT regs directly*/
pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_0,
&rcv[0][0]);
pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_1,
&rcv[0][1]);
pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_2,
&rcv[1][0]);
pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_3,
&rcv[1][1]);
pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_4,
&rcv[2][0]);
pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_5,
&rcv[2][1]);
for (i = 0 ; i < 3; i++) {
debugf3("MC_COR_ECC_CNT%d = 0x%x; MC_COR_ECC_CNT%d = 0x%x\n",
(i * 2), rcv[i][0], (i * 2) + 1, rcv[i][1]);
/*if the channel has 3 dimms*/
if (pvt->channel[socket][i].dimms > 2) {
new0 = DIMM_BOT_COR_ERR(rcv[i][0]);
new1 = DIMM_TOP_COR_ERR(rcv[i][0]);
new2 = DIMM_BOT_COR_ERR(rcv[i][1]);
} else {
new0 = DIMM_TOP_COR_ERR(rcv[i][0]) +
DIMM_BOT_COR_ERR(rcv[i][0]);
new1 = DIMM_TOP_COR_ERR(rcv[i][1]) +
DIMM_BOT_COR_ERR(rcv[i][1]);
new2 = 0;
}
i7core_rdimm_update_ce_count(mci, socket, i, new0, new1, new2);
}
}
/* This function is based on the device 3 function 4 registers as described on: /* This function is based on the device 3 function 4 registers as described on:
* Intel Xeon Processor 5500 Series Datasheet Volume 2 * Intel Xeon Processor 5500 Series Datasheet Volume 2
...@@ -1315,7 +1453,7 @@ static int mci_bind_devs(struct mem_ctl_info *mci) ...@@ -1315,7 +1453,7 @@ static int mci_bind_devs(struct mem_ctl_info *mci)
* also available at: * also available at:
* http://www.arrownac.com/manufacturers/intel/s/nehalem/5500-datasheet-v2.pdf * http://www.arrownac.com/manufacturers/intel/s/nehalem/5500-datasheet-v2.pdf
*/ */
static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket) static void i7core_udimm_check_mc_ecc_err(struct mem_ctl_info *mci, u8 socket)
{ {
struct i7core_pvt *pvt = mci->pvt_info; struct i7core_pvt *pvt = mci->pvt_info;
u32 rcv1, rcv0; u32 rcv1, rcv0;
...@@ -1326,7 +1464,7 @@ static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket) ...@@ -1326,7 +1464,7 @@ static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket)
return; return;
} }
/* Corrected error reads */ /* Corrected test errors */
pci_read_config_dword(pvt->pci_mcr[socket][4], MC_TEST_ERR_RCV1, &rcv1); pci_read_config_dword(pvt->pci_mcr[socket][4], MC_TEST_ERR_RCV1, &rcv1);
pci_read_config_dword(pvt->pci_mcr[socket][4], MC_TEST_ERR_RCV0, &rcv0); pci_read_config_dword(pvt->pci_mcr[socket][4], MC_TEST_ERR_RCV0, &rcv0);
...@@ -1335,39 +1473,38 @@ static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket) ...@@ -1335,39 +1473,38 @@ static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket)
new1 = DIMM1_COR_ERR(rcv0); new1 = DIMM1_COR_ERR(rcv0);
new0 = DIMM0_COR_ERR(rcv0); new0 = DIMM0_COR_ERR(rcv0);
#if 0
debugf2("%s CE rcv1=0x%08x rcv0=0x%08x, %d %d %d\n",
(pvt->ce_count_available ? "UPDATE" : "READ"),
rcv1, rcv0, new0, new1, new2);
#endif
/* Updates CE counters if it is not the first time here */ /* Updates CE counters if it is not the first time here */
if (pvt->ce_count_available[socket]) { if (pvt->ce_count_available[socket]) {
/* Updates CE counters */ /* Updates CE counters */
int add0, add1, add2; int add0, add1, add2;
add2 = new2 - pvt->last_ce_count[socket][2]; add2 = new2 - pvt->udimm_last_ce_count[socket][2];
add1 = new1 - pvt->last_ce_count[socket][1]; add1 = new1 - pvt->udimm_last_ce_count[socket][1];
add0 = new0 - pvt->last_ce_count[socket][0]; add0 = new0 - pvt->udimm_last_ce_count[socket][0];
if (add2 < 0) if (add2 < 0)
add2 += 0x7fff; add2 += 0x7fff;
pvt->ce_count[socket][2] += add2; pvt->udimm_ce_count[socket][2] += add2;
if (add1 < 0) if (add1 < 0)
add1 += 0x7fff; add1 += 0x7fff;
pvt->ce_count[socket][1] += add1; pvt->udimm_ce_count[socket][1] += add1;
if (add0 < 0) if (add0 < 0)
add0 += 0x7fff; add0 += 0x7fff;
pvt->ce_count[socket][0] += add0; pvt->udimm_ce_count[socket][0] += add0;
if (add0 | add1 | add2)
i7core_printk(KERN_ERR, "New Corrected error(s): "
"dimm0: +%d, dimm1: +%d, dimm2 +%d\n",
add0, add1, add2);
} else } else
pvt->ce_count_available[socket] = 1; pvt->ce_count_available[socket] = 1;
/* Store the new values */ /* Store the new values */
pvt->last_ce_count[socket][2] = new2; pvt->udimm_last_ce_count[socket][2] = new2;
pvt->last_ce_count[socket][1] = new1; pvt->udimm_last_ce_count[socket][1] = new1;
pvt->last_ce_count[socket][0] = new0; pvt->udimm_last_ce_count[socket][0] = new0;
} }
/* /*
...@@ -1386,6 +1523,7 @@ static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket) ...@@ -1386,6 +1523,7 @@ static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket)
static void i7core_mce_output_error(struct mem_ctl_info *mci, static void i7core_mce_output_error(struct mem_ctl_info *mci,
struct mce *m) struct mce *m)
{ {
struct i7core_pvt *pvt = mci->pvt_info;
char *type, *optype, *err, *msg; char *type, *optype, *err, *msg;
unsigned long error = m->status & 0x1ff0000l; unsigned long error = m->status & 0x1ff0000l;
u32 optypenum = (m->status >> 4) & 0x07; u32 optypenum = (m->status >> 4) & 0x07;
...@@ -1394,6 +1532,7 @@ static void i7core_mce_output_error(struct mem_ctl_info *mci, ...@@ -1394,6 +1532,7 @@ static void i7core_mce_output_error(struct mem_ctl_info *mci,
u32 channel = (m->misc >> 18) & 0x3; u32 channel = (m->misc >> 18) & 0x3;
u32 syndrome = m->misc >> 32; u32 syndrome = m->misc >> 32;
u32 errnum = find_first_bit(&error, 32); u32 errnum = find_first_bit(&error, 32);
int csrow;
if (m->mcgstatus & 1) if (m->mcgstatus & 1)
type = "FATAL"; type = "FATAL";
...@@ -1463,9 +1602,15 @@ static void i7core_mce_output_error(struct mem_ctl_info *mci, ...@@ -1463,9 +1602,15 @@ static void i7core_mce_output_error(struct mem_ctl_info *mci,
debugf0("%s", msg); debugf0("%s", msg);
csrow = pvt->csrow_map[m->cpu][channel][dimm];
/* Call the helper to output message */ /* Call the helper to output message */
edac_mc_handle_fbd_ue(mci, 0 /* FIXME: should be rank here */, if (m->mcgstatus & 1)
0, 0 /* FIXME: should be channel here */, msg); edac_mc_handle_fbd_ue(mci, csrow, 0,
0 /* FIXME: should be channel here */, msg);
else if (!pvt->is_registered)
edac_mc_handle_fbd_ce(mci, csrow,
0 /* FIXME: should be channel here */, msg);
kfree(msg); kfree(msg);
} }
...@@ -1502,7 +1647,10 @@ static void i7core_check_error(struct mem_ctl_info *mci) ...@@ -1502,7 +1647,10 @@ static void i7core_check_error(struct mem_ctl_info *mci)
/* check memory count errors */ /* check memory count errors */
for (i = 0; i < pvt->sockets; i++) for (i = 0; i < pvt->sockets; i++)
check_mc_test_err(mci, i); if (!pvt->is_registered)
i7core_udimm_check_mc_ecc_err(mci, i);
else
i7core_rdimm_check_mc_ecc_err(mci, i);
} }
/* /*
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册