From b70ef01016850de87b9a28a6af19fed8801df076 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 25 Jun 2009 19:32:38 +0200 Subject: [PATCH] EDAC: move MCE error descriptions to EDAC core This is in preparation of adding AMD-specific MCE decoding functionality to the EDAC core. The error decoding macros originate from the AMD64 EDAC driver albeit in a simplified and cleaned up version here. While at it, add macros to generate the error description strings and use them in the error type decoders directly which removes a bunch of code and makes the decoding functions much more readable. Also, fix strings and shorten macro names. Remove superfluous htlink_msgs. Signed-off-by: Borislav Petkov --- drivers/edac/Makefile | 6 +- drivers/edac/amd64_edac.c | 140 ++++++++++-------- drivers/edac/amd64_edac.h | 17 +-- ...{amd64_edac_err_types.c => edac_mce_amd.c} | 78 ++-------- drivers/edac/edac_mce_amd.h | 29 ++++ 5 files changed, 122 insertions(+), 148 deletions(-) rename drivers/edac/{amd64_edac_err_types.c => edac_mce_amd.c} (61%) create mode 100644 drivers/edac/edac_mce_amd.h diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 98aa4a7db412..cfa033ce53a7 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -17,6 +17,10 @@ ifdef CONFIG_PCI edac_core-objs += edac_pci.o edac_pci_sysfs.o endif +ifdef CONFIG_CPU_SUP_AMD +edac_core-objs += edac_mce_amd.o +endif + obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o obj-$(CONFIG_EDAC_CPC925) += cpc925_edac.o obj-$(CONFIG_EDAC_I5000) += i5000_edac.o @@ -32,7 +36,7 @@ obj-$(CONFIG_EDAC_X38) += x38_edac.o obj-$(CONFIG_EDAC_I82860) += i82860_edac.o obj-$(CONFIG_EDAC_R82600) += r82600_edac.o -amd64_edac_mod-y := amd64_edac_err_types.o amd64_edac.o +amd64_edac_mod-y := amd64_edac.o amd64_edac_mod-$(CONFIG_EDAC_DEBUG) += amd64_edac_dbg.o amd64_edac_mod-$(CONFIG_EDAC_AMD64_ERROR_INJECTION) += amd64_edac_inj.o diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index e2a10bcba7a1..b9e84bc91766 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -18,6 +18,63 @@ struct amd64_pvt; static struct mem_ctl_info *mci_lookup[MAX_NUMNODES]; static struct amd64_pvt *pvt_lookup[MAX_NUMNODES]; +/* + * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only + * for DDR2 DRAM mapping. + */ +u32 revf_quad_ddr2_shift[] = { + 0, /* 0000b NULL DIMM (128mb) */ + 28, /* 0001b 256mb */ + 29, /* 0010b 512mb */ + 29, /* 0011b 512mb */ + 29, /* 0100b 512mb */ + 30, /* 0101b 1gb */ + 30, /* 0110b 1gb */ + 31, /* 0111b 2gb */ + 31, /* 1000b 2gb */ + 32, /* 1001b 4gb */ + 32, /* 1010b 4gb */ + 33, /* 1011b 8gb */ + 0, /* 1100b future */ + 0, /* 1101b future */ + 0, /* 1110b future */ + 0 /* 1111b future */ +}; + +/* + * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing + * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching- + * or higher value'. + * + *FIXME: Produce a better mapping/linearisation. 
+ */ + +struct scrubrate scrubrates[] = { + { 0x01, 1600000000UL}, + { 0x02, 800000000UL}, + { 0x03, 400000000UL}, + { 0x04, 200000000UL}, + { 0x05, 100000000UL}, + { 0x06, 50000000UL}, + { 0x07, 25000000UL}, + { 0x08, 12284069UL}, + { 0x09, 6274509UL}, + { 0x0A, 3121951UL}, + { 0x0B, 1560975UL}, + { 0x0C, 781440UL}, + { 0x0D, 390720UL}, + { 0x0E, 195300UL}, + { 0x0F, 97650UL}, + { 0x10, 48854UL}, + { 0x11, 24427UL}, + { 0x12, 12213UL}, + { 0x13, 6101UL}, + { 0x14, 3051UL}, + { 0x15, 1523UL}, + { 0x16, 761UL}, + { 0x00, 0UL}, /* scrubbing off */ +}; + /* * Memory scrubber control interface. For K8, memory scrubbing is handled by * hardware and can involve L2 cache, dcache as well as the main memory. With @@ -1101,8 +1158,8 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u32 page, offset; /* Extract the syndrome parts and form a 16-bit syndrome */ - syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8; - syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh); + syndrome = HIGH_SYNDROME(info->nbsl) << 8; + syndrome |= LOW_SYNDROME(info->nbsh); /* CHIPKILL enabled */ if (info->nbcfg & K8_NBCFG_CHIPKILL) { @@ -1701,8 +1758,8 @@ static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci, if (csrow >= 0) { error_address_to_page_and_offset(sys_addr, &page, &offset); - syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8; - syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh); + syndrome = HIGH_SYNDROME(info->nbsl) << 8; + syndrome |= LOW_SYNDROME(info->nbsh); /* * Is CHIPKILL on? If so, then we can attempt to use the @@ -2155,36 +2212,22 @@ static int amd64_get_error_info(struct mem_ctl_info *mci, static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci, struct amd64_error_info_regs *info) { - u32 err_code; - u32 ec_tt; /* error code transaction type (2b) */ - u32 ec_ll; /* error code cache level (2b) */ - - err_code = EXTRACT_ERROR_CODE(info->nbsl); - ec_ll = EXTRACT_LL_CODE(err_code); - ec_tt = EXTRACT_TT_CODE(err_code); + u32 ec = ERROR_CODE(info->nbsl); amd64_mc_printk(mci, KERN_ERR, "GART TLB event: transaction type(%s), " - "cache level(%s)\n", tt_msgs[ec_tt], ll_msgs[ec_ll]); + "cache level(%s)\n", TT_MSG(ec), LL_MSG(ec)); } static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci, struct amd64_error_info_regs *info) { - u32 err_code; - u32 ec_rrrr; /* error code memory transaction (4b) */ - u32 ec_tt; /* error code transaction type (2b) */ - u32 ec_ll; /* error code cache level (2b) */ - - err_code = EXTRACT_ERROR_CODE(info->nbsl); - ec_ll = EXTRACT_LL_CODE(err_code); - ec_tt = EXTRACT_TT_CODE(err_code); - ec_rrrr = EXTRACT_RRRR_CODE(err_code); + u32 ec = ERROR_CODE(info->nbsl); amd64_mc_printk(mci, KERN_ERR, "cache hierarchy error: memory transaction type(%s), " "transaction type(%s), cache level(%s)\n", - rrrr_msgs[ec_rrrr], tt_msgs[ec_tt], ll_msgs[ec_ll]); + RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); } @@ -2264,21 +2307,8 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, static void amd64_decode_bus_error(struct mem_ctl_info *mci, struct amd64_error_info_regs *info) { - u32 err_code, ext_ec; - u32 ec_pp; /* error code participating processor (2p) */ - u32 ec_to; /* error code timed out (1b) */ - u32 ec_rrrr; /* error code memory transaction (4b) */ - u32 ec_ii; /* error code memory or I/O (2b) */ - u32 ec_ll; /* error code cache level (2b) */ - - ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl); - err_code = EXTRACT_ERROR_CODE(info->nbsl); - - ec_ll = EXTRACT_LL_CODE(err_code); - ec_ii = EXTRACT_II_CODE(err_code); - ec_rrrr = EXTRACT_RRRR_CODE(err_code); - 
ec_to = EXTRACT_TO_CODE(err_code); - ec_pp = EXTRACT_PP_CODE(err_code); + u32 ec = ERROR_CODE(info->nbsl); + u32 xec = EXT_ERROR_CODE(info->nbsl); amd64_mc_printk(mci, KERN_ERR, "BUS ERROR:\n" @@ -2286,20 +2316,17 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, " participating processor(%s)\n" " memory transaction type(%s)\n" " cache level(%s) Error Found by: %s\n", - to_msgs[ec_to], - ii_msgs[ec_ii], - pp_msgs[ec_pp], - rrrr_msgs[ec_rrrr], - ll_msgs[ec_ll], + TO_MSG(ec), II_MSG(ec), PP_MSG(ec), RRRR_MSG(ec), LL_MSG(ec), (info->nbsh & K8_NBSH_ERR_SCRUBER) ? "Scrubber" : "Normal Operation"); - /* If this was an 'observed' error, early out */ - if (ec_pp == K8_NBSL_PP_OBS) - return; /* We aren't the node involved */ + + /* Bail early out if this was an 'observed' error */ + if (PP(ec) == K8_NBSL_PP_OBS) + return; /* Parse out the extended error code for ECC events */ - switch (ext_ec) { + switch (xec) { /* F10 changed to one Extended ECC error code */ case F10_NBSL_EXT_ERR_RES: /* Reserved field */ case F10_NBSL_EXT_ERR_ECC: /* F10 ECC ext err code */ @@ -2379,7 +2406,7 @@ int amd64_process_error_info(struct mem_ctl_info *mci, (regs->nbsh & K8_NBSH_CORE3) ? "True" : "False"); - err_code = EXTRACT_ERROR_CODE(regs->nbsl); + err_code = ERROR_CODE(regs->nbsl); /* Determine which error type: * 1) GART errors - non-fatal, developmental events @@ -2387,7 +2414,7 @@ int amd64_process_error_info(struct mem_ctl_info *mci, * 3) BUS errors * 4) Unknown error */ - if (TEST_TLB_ERROR(err_code)) { + if (TLB_ERROR(err_code)) { /* * GART errors are intended to help graphics driver developers * to detect bad GART PTEs. It is recommended by AMD to disable @@ -2411,10 +2438,10 @@ int amd64_process_error_info(struct mem_ctl_info *mci, debugf1("GART TLB error\n"); amd64_decode_gart_tlb_error(mci, info); - } else if (TEST_MEM_ERROR(err_code)) { + } else if (MEM_ERROR(err_code)) { debugf1("Memory/Cache error\n"); amd64_decode_mem_cache_error(mci, info); - } else if (TEST_BUS_ERROR(err_code)) { + } else if (BUS_ERROR(err_code)) { debugf1("Bus (Link/DRAM) error\n"); amd64_decode_bus_error(mci, info); } else { @@ -2424,21 +2451,10 @@ int amd64_process_error_info(struct mem_ctl_info *mci, err_code); } - ext_ec = EXTRACT_EXT_ERROR_CODE(regs->nbsl); + ext_ec = EXT_ERROR_CODE(regs->nbsl); amd64_mc_printk(mci, KERN_ERR, "ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]); - if (((ext_ec >= F10_NBSL_EXT_ERR_CRC && - ext_ec <= F10_NBSL_EXT_ERR_TGT) || - (ext_ec == F10_NBSL_EXT_ERR_RMW)) && - EXTRACT_LDT_LINK(info->nbsh)) { - - amd64_mc_printk(mci, KERN_ERR, - "Error on hypertransport link: %s\n", - htlink_msgs[ - EXTRACT_LDT_LINK(info->nbsh)]); - } - /* * Check the UE bit of the NB status high register, if set generate some * logs. If NOT a GART error, then process the event as a NO-INFO event. diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index ba73015af8e4..1ddef8d15d52 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -72,6 +72,7 @@ #include #include #include "edac_core.h" +#include "edac_mce_amd.h" #define amd64_printk(level, fmt, arg...) 
\ edac_printk(level, "amd64", fmt, ##arg) @@ -303,9 +304,6 @@ enum { #define K8_NBSL 0x48 -#define EXTRACT_HIGH_SYNDROME(x) (((x) >> 24) & 0xff) -#define EXTRACT_EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) - /* Family F10h: Normalized Extended Error Codes */ #define F10_NBSL_EXT_ERR_RES 0x0 #define F10_NBSL_EXT_ERR_CRC 0x1 @@ -348,17 +346,6 @@ enum { #define K8_NBSL_EXT_ERR_CHIPKILL_ECC 0x8 #define K8_NBSL_EXT_ERR_DRAM_PARITY 0xD -#define EXTRACT_ERROR_CODE(x) ((x) & 0xffff) -#define TEST_TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) -#define TEST_MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) -#define TEST_BUS_ERROR(x) (((x) & 0xF800) == 0x0800) -#define EXTRACT_TT_CODE(x) (((x) >> 2) & 0x3) -#define EXTRACT_II_CODE(x) (((x) >> 2) & 0x3) -#define EXTRACT_LL_CODE(x) (((x) >> 0) & 0x3) -#define EXTRACT_RRRR_CODE(x) (((x) >> 4) & 0xf) -#define EXTRACT_TO_CODE(x) (((x) >> 8) & 0x1) -#define EXTRACT_PP_CODE(x) (((x) >> 9) & 0x3) - /* * The following are for BUS type errors AFTER values have been normalized by * shifting right @@ -386,9 +373,7 @@ enum { #define K8_NBSH_CORE1 BIT(1) #define K8_NBSH_CORE0 BIT(0) -#define EXTRACT_LDT_LINK(x) (((x) >> 4) & 0x7) #define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF) -#define EXTRACT_LOW_SYNDROME(x) (((x) >> 15) & 0xff) #define K8_NBEAL 0x50 diff --git a/drivers/edac/amd64_edac_err_types.c b/drivers/edac/edac_mce_amd.c similarity index 61% rename from drivers/edac/amd64_edac_err_types.c rename to drivers/edac/edac_mce_amd.c index f212ff12a9d8..cf8465450b32 100644 --- a/drivers/edac/amd64_edac_err_types.c +++ b/drivers/edac/edac_mce_amd.c @@ -1,61 +1,5 @@ -#include "amd64_edac.h" - -/* - * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only - * for DDR2 DRAM mapping. - */ -u32 revf_quad_ddr2_shift[] = { - 0, /* 0000b NULL DIMM (128mb) */ - 28, /* 0001b 256mb */ - 29, /* 0010b 512mb */ - 29, /* 0011b 512mb */ - 29, /* 0100b 512mb */ - 30, /* 0101b 1gb */ - 30, /* 0110b 1gb */ - 31, /* 0111b 2gb */ - 31, /* 1000b 2gb */ - 32, /* 1001b 4gb */ - 32, /* 1010b 4gb */ - 33, /* 1011b 8gb */ - 0, /* 1100b future */ - 0, /* 1101b future */ - 0, /* 1110b future */ - 0 /* 1111b future */ -}; - -/* - * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing - * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching- - * or higher value'. - * - *FIXME: Produce a better mapping/linearisation. 
- */ - -struct scrubrate scrubrates[] = { - { 0x01, 1600000000UL}, - { 0x02, 800000000UL}, - { 0x03, 400000000UL}, - { 0x04, 200000000UL}, - { 0x05, 100000000UL}, - { 0x06, 50000000UL}, - { 0x07, 25000000UL}, - { 0x08, 12284069UL}, - { 0x09, 6274509UL}, - { 0x0A, 3121951UL}, - { 0x0B, 1560975UL}, - { 0x0C, 781440UL}, - { 0x0D, 390720UL}, - { 0x0E, 195300UL}, - { 0x0F, 97650UL}, - { 0x10, 48854UL}, - { 0x11, 24427UL}, - { 0x12, 12213UL}, - { 0x13, 6101UL}, - { 0x14, 3051UL}, - { 0x15, 1523UL}, - { 0x16, 761UL}, - { 0x00, 0UL}, /* scrubbing off */ -}; +#include +#include "edac_mce_amd.h" /* * string representation for the different MCA reported error types, see F3x48 @@ -67,6 +11,7 @@ const char *tt_msgs[] = { /* transaction type */ "generic", "reserved" }; +EXPORT_SYMBOL_GPL(tt_msgs); const char *ll_msgs[] = { /* cache level */ "L0", @@ -74,6 +19,7 @@ const char *ll_msgs[] = { /* cache level */ "L2", "L3/generic" }; +EXPORT_SYMBOL_GPL(ll_msgs); const char *rrrr_msgs[] = { "generic", @@ -93,6 +39,7 @@ const char *rrrr_msgs[] = { "reserved RRRR= 14", "reserved RRRR= 15" }; +EXPORT_SYMBOL_GPL(rrrr_msgs); const char *pp_msgs[] = { /* participating processor */ "local node originated (SRC)", @@ -100,11 +47,13 @@ const char *pp_msgs[] = { /* participating processor */ "local node observed as 3rd party (OBS)", "generic" }; +EXPORT_SYMBOL_GPL(pp_msgs); const char *to_msgs[] = { "no timeout", "timed out" }; +EXPORT_SYMBOL_GPL(to_msgs); const char *ii_msgs[] = { /* memory or i/o */ "mem access", @@ -112,6 +61,7 @@ const char *ii_msgs[] = { /* memory or i/o */ "i/o access", "generic" }; +EXPORT_SYMBOL_GPL(ii_msgs); /* Map the 5 bits of Extended Error code to the string table. */ const char *ext_msgs[] = { /* extended error */ @@ -148,14 +98,4 @@ const char *ext_msgs[] = { /* extended error */ "L3 Cache LRU error", /* 1_1110b */ "Res 0x1FF error" /* 1_1111b */ }; - -const char *htlink_msgs[] = { - "none", - "1", - "2", - "1 2", - "3", - "1 3", - "2 3", - "1 2 3" -}; +EXPORT_SYMBOL_GPL(ext_msgs); diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h new file mode 100644 index 000000000000..81f9dcf9990a --- /dev/null +++ b/drivers/edac/edac_mce_amd.h @@ -0,0 +1,29 @@ +#define ERROR_CODE(x) ((x) & 0xffff) +#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) +#define LOW_SYNDROME(x) (((x) >> 15) & 0xff) +#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) + +#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) +#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) +#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800) + +#define TT(x) (((x) >> 2) & 0x3) +#define TT_MSG(x) tt_msgs[TT(x)] +#define II(x) (((x) >> 2) & 0x3) +#define II_MSG(x) ii_msgs[II(x)] +#define LL(x) (((x) >> 0) & 0x3) +#define LL_MSG(x) ll_msgs[LL(x)] +#define RRRR(x) (((x) >> 4) & 0xf) +#define RRRR_MSG(x) rrrr_msgs[RRRR(x)] +#define TO(x) (((x) >> 8) & 0x1) +#define TO_MSG(x) to_msgs[TO(x)] +#define PP(x) (((x) >> 9) & 0x3) +#define PP_MSG(x) pp_msgs[PP(x)] + +extern const char *tt_msgs[]; +extern const char *ll_msgs[]; +extern const char *rrrr_msgs[]; +extern const char *pp_msgs[]; +extern const char *to_msgs[]; +extern const char *ii_msgs[]; +extern const char *ext_msgs[]; -- GitLab