diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index 8d9c2d232d2a19a4cf6f8cf3cd683ef2bf8835a3..a3eebd193dfe085f3e880b936f28577b3667f4b4 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -34,6 +34,15 @@ #include "powernv.h" #include "pci.h" +/* Debugging option */ +#ifdef IODA_EEH_DBG_ON +#define IODA_EEH_DBG(args...) pr_info(args) +#else +#define IODA_EEH_DBG(args...) +#endif + +static char *hub_diag = NULL; + /** * ioda_eeh_post_init - Chip dependent post initialization * @hose: PCI controller @@ -47,8 +56,19 @@ static int ioda_eeh_post_init(struct pci_controller *hose) struct pnv_phb *phb = hose->private_data; /* FIXME: Enable it for PHB3 later */ - if (phb->type == PNV_PHB_IODA1) + if (phb->type == PNV_PHB_IODA1) { + if (!hub_diag) { + hub_diag = (char *)__get_free_page(GFP_KERNEL | + __GFP_ZERO); + if (!hub_diag) { + pr_err("%s: Out of memory !\n", + __func__); + return -ENOMEM; + } + } + phb->eeh_enabled = 1; + } return 0; } @@ -498,6 +518,316 @@ static int ioda_eeh_configure_bridge(struct eeh_pe *pe) return 0; } +static void ioda_eeh_hub_diag_common(struct OpalIoP7IOCErrorData *data) +{ + /* GEM */ + pr_info(" GEM XFIR: %016llx\n", data->gemXfir); + pr_info(" GEM RFIR: %016llx\n", data->gemRfir); + pr_info(" GEM RIRQFIR: %016llx\n", data->gemRirqfir); + pr_info(" GEM Mask: %016llx\n", data->gemMask); + pr_info(" GEM RWOF: %016llx\n", data->gemRwof); + + /* LEM */ + pr_info(" LEM FIR: %016llx\n", data->lemFir); + pr_info(" LEM Error Mask: %016llx\n", data->lemErrMask); + pr_info(" LEM Action 0: %016llx\n", data->lemAction0); + pr_info(" LEM Action 1: %016llx\n", data->lemAction1); + pr_info(" LEM WOF: %016llx\n", data->lemWof); +} + +static void ioda_eeh_hub_diag(struct pci_controller *hose) +{ + struct pnv_phb *phb = hose->private_data; + struct OpalIoP7IOCErrorData *data; + long rc; + + data = (struct OpalIoP7IOCErrorData *)ioda_eeh_hub_diag; + rc = opal_pci_get_hub_diag_data(phb->hub_id, data, PAGE_SIZE); + if (rc != OPAL_SUCCESS) { + pr_warning("%s: Failed to get HUB#%llx diag-data (%ld)\n", + __func__, phb->hub_id, rc); + return; + } + + switch (data->type) { + case OPAL_P7IOC_DIAG_TYPE_RGC: + pr_info("P7IOC diag-data for RGC\n\n"); + ioda_eeh_hub_diag_common(data); + pr_info(" RGC Status: %016llx\n", data->rgc.rgcStatus); + pr_info(" RGC LDCP: %016llx\n", data->rgc.rgcLdcp); + break; + case OPAL_P7IOC_DIAG_TYPE_BI: + pr_info("P7IOC diag-data for BI %s\n\n", + data->bi.biDownbound ? "Downbound" : "Upbound"); + ioda_eeh_hub_diag_common(data); + pr_info(" BI LDCP 0: %016llx\n", data->bi.biLdcp0); + pr_info(" BI LDCP 1: %016llx\n", data->bi.biLdcp1); + pr_info(" BI LDCP 2: %016llx\n", data->bi.biLdcp2); + pr_info(" BI Fence Status: %016llx\n", data->bi.biFenceStatus); + break; + case OPAL_P7IOC_DIAG_TYPE_CI: + pr_info("P7IOC diag-data for CI Port %d\\nn", + data->ci.ciPort); + ioda_eeh_hub_diag_common(data); + pr_info(" CI Port Status: %016llx\n", data->ci.ciPortStatus); + pr_info(" CI Port LDCP: %016llx\n", data->ci.ciPortLdcp); + break; + case OPAL_P7IOC_DIAG_TYPE_MISC: + pr_info("P7IOC diag-data for MISC\n\n"); + ioda_eeh_hub_diag_common(data); + break; + case OPAL_P7IOC_DIAG_TYPE_I2C: + pr_info("P7IOC diag-data for I2C\n\n"); + ioda_eeh_hub_diag_common(data); + break; + default: + pr_warning("%s: Invalid type of HUB#%llx diag-data (%d)\n", + __func__, phb->hub_id, data->type); + } +} + +static void ioda_eeh_p7ioc_phb_diag(struct pci_controller *hose, + struct OpalIoPhbErrorCommon *common) +{ + struct OpalIoP7IOCPhbErrorData *data; + int i; + + data = (struct OpalIoP7IOCPhbErrorData *)common; + + pr_info("P7IOC PHB#%x Diag-data (Version: %d)\n\n", + hose->global_number, common->version); + + pr_info(" brdgCtl: %08x\n", data->brdgCtl); + + pr_info(" portStatusReg: %08x\n", data->portStatusReg); + pr_info(" rootCmplxStatus: %08x\n", data->rootCmplxStatus); + pr_info(" busAgentStatus: %08x\n", data->busAgentStatus); + + pr_info(" deviceStatus: %08x\n", data->deviceStatus); + pr_info(" slotStatus: %08x\n", data->slotStatus); + pr_info(" linkStatus: %08x\n", data->linkStatus); + pr_info(" devCmdStatus: %08x\n", data->devCmdStatus); + pr_info(" devSecStatus: %08x\n", data->devSecStatus); + + pr_info(" rootErrorStatus: %08x\n", data->rootErrorStatus); + pr_info(" uncorrErrorStatus: %08x\n", data->uncorrErrorStatus); + pr_info(" corrErrorStatus: %08x\n", data->corrErrorStatus); + pr_info(" tlpHdr1: %08x\n", data->tlpHdr1); + pr_info(" tlpHdr2: %08x\n", data->tlpHdr2); + pr_info(" tlpHdr3: %08x\n", data->tlpHdr3); + pr_info(" tlpHdr4: %08x\n", data->tlpHdr4); + pr_info(" sourceId: %08x\n", data->sourceId); + + pr_info(" errorClass: %016llx\n", data->errorClass); + pr_info(" correlator: %016llx\n", data->correlator); + pr_info(" p7iocPlssr: %016llx\n", data->p7iocPlssr); + pr_info(" p7iocCsr: %016llx\n", data->p7iocCsr); + pr_info(" lemFir: %016llx\n", data->lemFir); + pr_info(" lemErrorMask: %016llx\n", data->lemErrorMask); + pr_info(" lemWOF: %016llx\n", data->lemWOF); + pr_info(" phbErrorStatus: %016llx\n", data->phbErrorStatus); + pr_info(" phbFirstErrorStatus: %016llx\n", data->phbFirstErrorStatus); + pr_info(" phbErrorLog0: %016llx\n", data->phbErrorLog0); + pr_info(" phbErrorLog1: %016llx\n", data->phbErrorLog1); + pr_info(" mmioErrorStatus: %016llx\n", data->mmioErrorStatus); + pr_info(" mmioFirstErrorStatus: %016llx\n", data->mmioFirstErrorStatus); + pr_info(" mmioErrorLog0: %016llx\n", data->mmioErrorLog0); + pr_info(" mmioErrorLog1: %016llx\n", data->mmioErrorLog1); + pr_info(" dma0ErrorStatus: %016llx\n", data->dma0ErrorStatus); + pr_info(" dma0FirstErrorStatus: %016llx\n", data->dma0FirstErrorStatus); + pr_info(" dma0ErrorLog0: %016llx\n", data->dma0ErrorLog0); + pr_info(" dma0ErrorLog1: %016llx\n", data->dma0ErrorLog1); + pr_info(" dma1ErrorStatus: %016llx\n", data->dma1ErrorStatus); + pr_info(" dma1FirstErrorStatus: %016llx\n", data->dma1FirstErrorStatus); + pr_info(" dma1ErrorLog0: %016llx\n", data->dma1ErrorLog0); + pr_info(" dma1ErrorLog1: %016llx\n", data->dma1ErrorLog1); + + for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) { + if ((data->pestA[i] >> 63) == 0 && + (data->pestB[i] >> 63) == 0) + continue; + + pr_info(" PE[%3d] PESTA: %016llx\n", i, data->pestA[i]); + pr_info(" PESTB: %016llx\n", data->pestB[i]); + } +} + +static void ioda_eeh_phb_diag(struct pci_controller *hose) +{ + struct pnv_phb *phb = hose->private_data; + struct OpalIoPhbErrorCommon *common; + long rc; + + common = (struct OpalIoPhbErrorCommon *)phb->diag.blob; + rc = opal_pci_get_phb_diag_data2(phb->opal_id, common, PAGE_SIZE); + if (rc != OPAL_SUCCESS) { + pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n", + __func__, hose->global_number, rc); + return; + } + + switch (common->ioType) { + case OPAL_PHB_ERROR_DATA_TYPE_P7IOC: + ioda_eeh_p7ioc_phb_diag(hose, common); + break; + default: + pr_warning("%s: Unrecognized I/O chip %d\n", + __func__, common->ioType); + } +} + +static int ioda_eeh_get_phb_pe(struct pci_controller *hose, + struct eeh_pe **pe) +{ + struct eeh_pe *phb_pe; + + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe) { + pr_warning("%s Can't find PE for PHB#%d\n", + __func__, hose->global_number); + return -EEXIST; + } + + *pe = phb_pe; + return 0; +} + +static int ioda_eeh_get_pe(struct pci_controller *hose, + u16 pe_no, struct eeh_pe **pe) +{ + struct eeh_pe *phb_pe, *dev_pe; + struct eeh_dev dev; + + /* Find the PHB PE */ + if (ioda_eeh_get_phb_pe(hose, &phb_pe)) + return -EEXIST; + + /* Find the PE according to PE# */ + memset(&dev, 0, sizeof(struct eeh_dev)); + dev.phb = hose; + dev.pe_config_addr = pe_no; + dev_pe = eeh_pe_get(&dev); + if (!dev_pe) { + pr_warning("%s: Can't find PE for PHB#%x - PE#%x\n", + __func__, hose->global_number, pe_no); + return -EEXIST; + } + + *pe = dev_pe; + return 0; +} + +/** + * ioda_eeh_next_error - Retrieve next error for EEH core to handle + * @pe: The affected PE + * + * The function is expected to be called by EEH core while it gets + * special EEH event (without binding PE). The function calls to + * OPAL APIs for next error to handle. The informational error is + * handled internally by platform. However, the dead IOC, dead PHB, + * fenced PHB and frozen PE should be handled by EEH core eventually. + */ +static int ioda_eeh_next_error(struct eeh_pe **pe) +{ + struct pci_controller *hose, *tmp; + struct pnv_phb *phb; + u64 frozen_pe_no; + u16 err_type, severity; + long rc; + int ret = 1; + + /* While running here, it's safe to purge the event queue */ + eeh_remove_event(NULL); + + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + /* + * If the subordinate PCI buses of the PHB has been + * removed, we needn't take care of it any more. + */ + phb = hose->private_data; + if (phb->removed) + continue; + + rc = opal_pci_next_error(phb->opal_id, + &frozen_pe_no, &err_type, &severity); + + /* If OPAL API returns error, we needn't proceed */ + if (rc != OPAL_SUCCESS) { + IODA_EEH_DBG("%s: Invalid return value on " + "PHB#%x (0x%lx) from opal_pci_next_error", + __func__, hose->global_number, rc); + continue; + } + + /* If the PHB doesn't have error, stop processing */ + if (err_type == OPAL_EEH_NO_ERROR || + severity == OPAL_EEH_SEV_NO_ERROR) { + IODA_EEH_DBG("%s: No error found on PHB#%x\n", + __func__, hose->global_number); + continue; + } + + /* + * Processing the error. We're expecting the error with + * highest priority reported upon multiple errors on the + * specific PHB. + */ + IODA_EEH_DBG("%s: Error (%d, %d, %d) on PHB#%x\n", + err_type, severity, pe_no, hose->global_number); + switch (err_type) { + case OPAL_EEH_IOC_ERROR: + if (severity == OPAL_EEH_SEV_IOC_DEAD) { + list_for_each_entry_safe(hose, tmp, + &hose_list, list_node) { + phb = hose->private_data; + phb->removed = 1; + } + + WARN(1, "EEH: dead IOC detected\n"); + ret = 4; + goto out; + } else if (severity == OPAL_EEH_SEV_INF) + ioda_eeh_hub_diag(hose); + + break; + case OPAL_EEH_PHB_ERROR: + if (severity == OPAL_EEH_SEV_PHB_DEAD) { + if (ioda_eeh_get_phb_pe(hose, pe)) + break; + + WARN(1, "EEH: dead PHB#%x detected\n", + hose->global_number); + phb->removed = 1; + ret = 3; + goto out; + } else if (severity == OPAL_EEH_SEV_PHB_FENCED) { + if (ioda_eeh_get_phb_pe(hose, pe)) + break; + + WARN(1, "EEH: fenced PHB#%x detected\n", + hose->global_number); + ret = 2; + goto out; + } else if (severity == OPAL_EEH_SEV_INF) + ioda_eeh_phb_diag(hose); + + break; + case OPAL_EEH_PE_ERROR: + if (ioda_eeh_get_pe(hose, frozen_pe_no, pe)) + break; + + WARN(1, "EEH: Frozen PE#%x on PHB#%x detected\n", + (*pe)->addr, (*pe)->phb->global_number); + ret = 1; + goto out; + } + } + + ret = 0; +out: + return ret; +} + struct pnv_eeh_ops ioda_eeh_ops = { .post_init = ioda_eeh_post_init, .set_option = ioda_eeh_set_option, @@ -505,5 +835,5 @@ struct pnv_eeh_ops ioda_eeh_ops = { .reset = ioda_eeh_reset, .get_log = ioda_eeh_get_log, .configure_bridge = ioda_eeh_configure_bridge, - .next_error = NULL + .next_error = ioda_eeh_next_error }; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 336c9dc1b314f424591d762c92ab013b46a9162a..3656a2409e9a9703b839e4b2d3a7b84fd42b5b79 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -93,6 +93,7 @@ struct pnv_phb { #ifdef CONFIG_EEH struct pnv_eeh_ops *eeh_ops; int eeh_enabled; + int removed; #endif #ifdef CONFIG_PCI_MSI