diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h index f06ce9ab692c38eb09f6203d9fc44747b7c6d720..740c7e22c0234b102f2c7d466ea1402ff04c32cf 100644 --- a/drivers/edac/edac_core.h +++ b/drivers/edac/edac_core.h @@ -463,12 +463,12 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, const unsigned long page_frame_number, const unsigned long offset_in_page, const unsigned long syndrome, - const int layer0, - const int layer1, - const int layer2, + const int top_layer, + const int mid_layer, + const int low_layer, const char *msg, const char *other_detail, - const void *mcelog); + const void *arch_log); /* * edac_device APIs diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 10f375032e9686f7c719c857ceca5dd3f9ef81ed..ce25750a83f904cdf2c4a7306ba5eb33527a0059 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -27,12 +27,17 @@ #include #include #include +#include #include #include #include #include "edac_core.h" #include "edac_module.h" +#define CREATE_TRACE_POINTS +#define TRACE_INCLUDE_PATH ../../include/ras +#include + /* lock to memory controller's control array */ static DEFINE_MUTEX(mem_ctls_mutex); static LIST_HEAD(mc_devices); @@ -384,6 +389,7 @@ struct mem_ctl_info *edac_mc_alloc(unsigned mc_num, * which will perform kobj unregistration and the actual free * will occur during the kobject callback operation */ + return mci; } EXPORT_SYMBOL_GPL(edac_mc_alloc); @@ -902,19 +908,19 @@ static void edac_ce_error(struct mem_ctl_info *mci, const bool enable_per_layer_report, const unsigned long page_frame_number, const unsigned long offset_in_page, - u32 grain) + long grain) { unsigned long remapped_page; if (edac_mc_get_log_ce()) { if (other_detail && *other_detail) edac_mc_printk(mci, KERN_WARNING, - "CE %s on %s (%s%s - %s)\n", + "CE %s on %s (%s %s - %s)\n", msg, label, location, detail, other_detail); else edac_mc_printk(mci, KERN_WARNING, - "CE %s on %s (%s%s)\n", + "CE %s on %s (%s %s)\n", msg, label, location, detail); } @@ -953,12 +959,12 @@ static void edac_ue_error(struct mem_ctl_info *mci, if (edac_mc_get_log_ue()) { if (other_detail && *other_detail) edac_mc_printk(mci, KERN_WARNING, - "UE %s on %s (%s%s - %s)\n", + "UE %s on %s (%s %s - %s)\n", msg, label, location, detail, other_detail); else edac_mc_printk(mci, KERN_WARNING, - "UE %s on %s (%s%s)\n", + "UE %s on %s (%s %s)\n", msg, label, location, detail); } @@ -975,27 +981,50 @@ static void edac_ue_error(struct mem_ctl_info *mci, } #define OTHER_LABEL " or " + +/** + * edac_mc_handle_error - reports a memory event to userspace + * + * @type: severity of the error (CE/UE/Fatal) + * @mci: a struct mem_ctl_info pointer + * @page_frame_number: mem page where the error occurred + * @offset_in_page: offset of the error inside the page + * @syndrome: ECC syndrome + * @top_layer: Memory layer[0] position + * @mid_layer: Memory layer[1] position + * @low_layer: Memory layer[2] position + * @msg: Message meaningful to the end users that + * explains the event + * @other_detail: Technical details about the event that + * may help hardware manufacturers and + * EDAC developers to analyse the event + * @arch_log: Architecture-specific struct that can + * be used to add extended information to the + * tracepoint, like dumping MCE registers. + */ void edac_mc_handle_error(const enum hw_event_mc_err_type type, struct mem_ctl_info *mci, const unsigned long page_frame_number, const unsigned long offset_in_page, const unsigned long syndrome, - const int layer0, - const int layer1, - const int layer2, + const int top_layer, + const int mid_layer, + const int low_layer, const char *msg, const char *other_detail, - const void *mcelog) + const void *arch_log) { /* FIXME: too much for stack: move it to some pre-alocated area */ char detail[80], location[80]; char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms]; char *p; int row = -1, chan = -1; - int pos[EDAC_MAX_LAYERS] = { layer0, layer1, layer2 }; + int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer }; int i; - u32 grain; + long grain; bool enable_per_layer_report = false; + u16 error_count; /* FIXME: make it a parameter */ + u8 grain_bits; debugf3("MC%d: %s()\n", mci->mc_idx, __func__); @@ -1045,11 +1074,11 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, for (i = 0; i < mci->tot_dimms; i++) { struct dimm_info *dimm = &mci->dimms[i]; - if (layer0 >= 0 && layer0 != dimm->location[0]) + if (top_layer >= 0 && top_layer != dimm->location[0]) continue; - if (layer1 >= 0 && layer1 != dimm->location[1]) + if (mid_layer >= 0 && mid_layer != dimm->location[1]) continue; - if (layer2 >= 0 && layer2 != dimm->location[2]) + if (low_layer >= 0 && low_layer != dimm->location[2]) continue; /* get the max grain, over the error match range */ @@ -1120,11 +1149,22 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, edac_layer_name[mci->layers[i].type], pos[i]); } + if (p > location) + *(p - 1) = '\0'; + + /* Report the error via the trace interface */ + + error_count = 1; /* FIXME: allow change it */ + grain_bits = fls_long(grain) + 1; + trace_mc_event(type, msg, label, error_count, + mci->mc_idx, top_layer, mid_layer, low_layer, + PAGES_TO_MiB(page_frame_number) | offset_in_page, + grain_bits, syndrome, other_detail); /* Memory type dependent details about the error */ if (type == HW_EVENT_ERR_CORRECTED) { snprintf(detail, sizeof(detail), - "page:0x%lx offset:0x%lx grain:%d syndrome:0x%lx", + "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx", page_frame_number, offset_in_page, grain, syndrome); edac_ce_error(mci, pos, msg, location, label, detail, @@ -1132,7 +1172,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, page_frame_number, offset_in_page, grain); } else { snprintf(detail, sizeof(detail), - "page:0x%lx offset:0x%lx grain:%d", + "page:0x%lx offset:0x%lx grain:%ld", page_frame_number, offset_in_page, grain); edac_ue_error(mci, pos, msg, location, label, detail, diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h new file mode 100644 index 0000000000000000000000000000000000000000..260470e72483071ee1843345e7969817e5b2571a --- /dev/null +++ b/include/ras/ras_event.h @@ -0,0 +1,102 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ras +#define TRACE_INCLUDE_FILE ras_event + +#if !defined(_TRACE_HW_EVENT_MC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HW_EVENT_MC_H + +#include +#include +#include + +/* + * Hardware Events Report + * + * Those events are generated when hardware detected a corrected or + * uncorrected event, and are meant to replace the current API to report + * errors defined on both EDAC and MCE subsystems. + * + * FIXME: Add events for handling memory errors originated from the + * MCE subsystem. + */ + +/* + * Hardware-independent Memory Controller specific events + */ + +/* + * Default error mechanisms for Memory Controller errors (CE and UE) + */ +TRACE_EVENT(mc_event, + + TP_PROTO(const unsigned int err_type, + const char *error_msg, + const char *label, + const int error_count, + const u8 mc_index, + const s8 top_layer, + const s8 mid_layer, + const s8 low_layer, + unsigned long address, + const u8 grain_bits, + unsigned long syndrome, + const char *driver_detail), + + TP_ARGS(err_type, error_msg, label, error_count, mc_index, + top_layer, mid_layer, low_layer, address, grain_bits, + syndrome, driver_detail), + + TP_STRUCT__entry( + __field( unsigned int, error_type ) + __string( msg, error_msg ) + __string( label, label ) + __field( u16, error_count ) + __field( u8, mc_index ) + __field( s8, top_layer ) + __field( s8, middle_layer ) + __field( s8, lower_layer ) + __field( long, address ) + __field( u8, grain_bits ) + __field( long, syndrome ) + __string( driver_detail, driver_detail ) + ), + + TP_fast_assign( + __entry->error_type = err_type; + __assign_str(msg, error_msg); + __assign_str(label, label); + __entry->error_count = error_count; + __entry->mc_index = mc_index; + __entry->top_layer = top_layer; + __entry->middle_layer = mid_layer; + __entry->lower_layer = low_layer; + __entry->address = address; + __entry->grain_bits = grain_bits; + __entry->syndrome = syndrome; + __assign_str(driver_detail, driver_detail); + ), + + TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)", + __entry->error_count, + (__entry->error_type == HW_EVENT_ERR_CORRECTED) ? "Corrected" : + ((__entry->error_type == HW_EVENT_ERR_FATAL) ? + "Fatal" : "Uncorrected"), + __entry->error_count > 1 ? "s" : "", + ((char *)__get_str(msg))[0] ? " " : "", + __get_str(msg), + __get_str(label), + __entry->mc_index, + __entry->top_layer, + __entry->middle_layer, + __entry->lower_layer, + __entry->address, + 1 << __entry->grain_bits, + __entry->syndrome, + ((char *)__get_str(driver_detail))[0] ? " " : "", + __get_str(driver_detail)) +); + +#endif /* _TRACE_HW_EVENT_MC_H */ + +/* This part must be outside protection */ +#include