提交 cf870c70 编写于 作者: N Naveen N. Rao 提交者: Tony Luck

mce: acpi/apei: Soft-offline a page on firmware GHES notification

If the firmware indicates in GHES error data entry that the error threshold
has exceeded for a corrected error event, then we try to soft-offline the
page. This could be called in interrupt context, so we queue this up similar
to how we handle memory failure scenarios.
Signed-off-by: NNaveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: NBorislav Petkov <bp@suse.de>
Signed-off-by: NTony Luck <tony.luck@intel.com>
上级 9ad95879
......@@ -409,6 +409,34 @@ static void ghes_clear_estatus(struct ghes *ghes)
ghes->flags &= ~GHES_TO_CLEAR;
}
static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int sev)
{
#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
unsigned long pfn;
int sec_sev = ghes_severity(gdata->error_severity);
struct cper_sec_mem_err *mem_err;
mem_err = (struct cper_sec_mem_err *)(gdata + 1);
if (sec_sev == GHES_SEV_CORRECTED &&
(gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED) &&
(mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS)) {
pfn = mem_err->physical_addr >> PAGE_SHIFT;
if (pfn_valid(pfn))
memory_failure_queue(pfn, 0, MF_SOFT_OFFLINE);
else if (printk_ratelimit())
pr_warn(FW_WARN GHES_PFX
"Invalid address in generic error data: %#llx\n",
mem_err->physical_addr);
}
if (sev == GHES_SEV_RECOVERABLE &&
sec_sev == GHES_SEV_RECOVERABLE &&
mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
pfn = mem_err->physical_addr >> PAGE_SHIFT;
memory_failure_queue(pfn, 0, 0);
}
#endif
}
static void ghes_do_proc(struct ghes *ghes,
const struct acpi_hest_generic_status *estatus)
{
......@@ -428,15 +456,7 @@ static void ghes_do_proc(struct ghes *ghes,
apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED,
mem_err);
#endif
#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
if (sev == GHES_SEV_RECOVERABLE &&
sec_sev == GHES_SEV_RECOVERABLE &&
mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
unsigned long pfn;
pfn = mem_err->physical_addr >> PAGE_SHIFT;
memory_failure_queue(pfn, 0, 0);
}
#endif
ghes_handle_memory_failure(gdata, sev);
}
#ifdef CONFIG_ACPI_APEI_PCIEAER
else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
......
......@@ -1784,6 +1784,7 @@ enum mf_flags {
MF_COUNT_INCREASED = 1 << 0,
MF_ACTION_REQUIRED = 1 << 1,
MF_MUST_KILL = 1 << 2,
MF_SOFT_OFFLINE = 1 << 3,
};
extern int memory_failure(unsigned long pfn, int trapno, int flags);
extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
......
......@@ -1286,7 +1286,10 @@ static void memory_failure_work_func(struct work_struct *work)
spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
if (!gotten)
break;
memory_failure(entry.pfn, entry.trapno, entry.flags);
if (entry.flags & MF_SOFT_OFFLINE)
soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
else
memory_failure(entry.pfn, entry.trapno, entry.flags);
}
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册