diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 2cb1cc253d51ebb5828e89f63b63462396050970..fc62ba8dce933d3bb1ec0262b7d4851bcc5fee26 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -15,6 +15,7 @@ struct machine_ops { }; extern struct machine_ops machine_ops; +extern int crashing_cpu; void native_machine_crash_shutdown(struct pt_regs *regs); void native_machine_shutdown(void); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8e9725c607ea6acb7a91deed9b72b2c9a873803e..177472ace83854c314810e4e32d9944cf5ebd227 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -49,6 +49,7 @@ #include #include #include +#include #include "mce-internal.h" @@ -1127,9 +1128,22 @@ void do_machine_check(struct pt_regs *regs, long error_code) * on Intel. */ int lmce = 1; + int cpu = smp_processor_id(); - /* If this CPU is offline, just bail out. */ - if (cpu_is_offline(smp_processor_id())) { + /* + * Cases where we avoid rendezvous handler timeout: + * 1) If this CPU is offline. + * + * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to + * skip those CPUs which remain looping in the 1st kernel - see + * crash_nmi_callback(). + * + * Note: there still is a small window between kexec-ing and the new, + * kdump kernel establishing a new #MC handler where a broadcasted MCE + * might not get handled properly. + */ + if (cpu_is_offline(cpu) || + (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 067f9813fd2cf7c15d5a1d297b537eedf6ca7959..2544700a2a87566437e3aea0d8b3696e7e0f439e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -765,10 +765,11 @@ void machine_crash_shutdown(struct pt_regs *regs) #endif +/* This is the CPU performing the emergency shutdown work. */ +int crashing_cpu = -1; + #if defined(CONFIG_SMP) -/* This keeps a track of which one is crashing cpu. */ -static int crashing_cpu; static nmi_shootdown_cb shootdown_callback; static atomic_t waiting_for_crash_ipi;