mce.c 3.0 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/*
 * NFIT - Machine Check Handler
 *
 * Copyright(c) 2013-2016 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
#include <linux/notifier.h>
#include <linux/acpi.h>
17
#include <linux/nd.h>
18 19 20 21 22 23 24 25 26 27
#include <asm/mce.h>
#include "nfit.h"

static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
			void *data)
{
	struct mce *mce = (struct mce *)data;
	struct acpi_nfit_desc *acpi_desc;
	struct nfit_spa *nfit_spa;

28 29
	/* We only care about uncorrectable memory errors */
	if (!mce_is_memory_error(mce) || mce_is_correctable(mce))
30 31
		return NOTIFY_DONE;

32 33 34 35
	/* Verify the address reported in the MCE is valid. */
	if (!mce_usable_address(mce))
		return NOTIFY_DONE;

36 37 38 39 40 41 42 43 44 45 46 47 48 49
	/*
	 * mce->addr contains the physical addr accessed that caused the
	 * machine check. We need to walk through the list of NFITs, and see
	 * if any of them matches that address, and only then start a scrub.
	 */
	mutex_lock(&acpi_desc_lock);
	list_for_each_entry(acpi_desc, &acpi_descs, list) {
		struct device *dev = acpi_desc->dev;
		int found_match = 0;

		mutex_lock(&acpi_desc->init_mutex);
		list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
			struct acpi_nfit_system_address *spa = nfit_spa->spa;

50
			if (nfit_spa_type(spa) != NFIT_SPA_PM)
51 52 53 54 55 56 57
				continue;
			/* find the spa that covers the mce addr */
			if (spa->address > mce->addr)
				continue;
			if ((spa->address + spa->length - 1) < mce->addr)
				continue;
			found_match = 1;
58 59
			dev_dbg(dev, "addr in SPA %d (0x%llx, 0x%llx)\n",
				spa->range_index, spa->address, spa->length);
60 61 62 63 64 65 66 67 68
			/*
			 * We can break at the first match because we're going
			 * to rescan all the SPA ranges. There shouldn't be any
			 * aliasing anyway.
			 */
			break;
		}
		mutex_unlock(&acpi_desc->init_mutex);

69 70 71 72
		if (!found_match)
			continue;

		/* If this fails due to an -ENOMEM, there is little we can do */
73
		nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus,
74 75 76 77 78 79 80 81 82 83 84
				ALIGN(mce->addr, L1_CACHE_BYTES),
				L1_CACHE_BYTES);
		nvdimm_region_notify(nfit_spa->nd_region,
				NVDIMM_REVALIDATE_POISON);

		if (acpi_desc->scrub_mode == HW_ERROR_SCRUB_ON) {
			/*
			 * We can ignore an -EBUSY here because if an ARS is
			 * already in progress, just let that be the last
			 * authoritative one
			 */
85
			acpi_nfit_ars_rescan(acpi_desc, 0);
86
		}
87
		mce->kflags |= MCE_HANDLED_NFIT;
88
		break;
89 90 91 92 93 94 95 96
	}

	mutex_unlock(&acpi_desc_lock);
	return NOTIFY_DONE;
}

static struct notifier_block nfit_mce_dec = {
	.notifier_call	= nfit_handle_mce,
97
	.priority	= MCE_PRIO_NFIT,
98 99 100 101 102 103 104 105 106 107 108
};

void nfit_mce_register(void)
{
	mce_register_decode_chain(&nfit_mce_dec);
}

void nfit_mce_unregister(void)
{
	mce_unregister_decode_chain(&nfit_mce_dec);
}