srat.c 8.1 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
18
#include <linux/bootmem.h>
19
#include <linux/memblock.h>
20
#include <linux/mm.h>
L
Linus Torvalds 已提交
21 22
#include <asm/proto.h>
#include <asm/numa.h>
23
#include <asm/e820.h>
I
Ingo Molnar 已提交
24
#include <asm/apic.h>
I
Ingo Molnar 已提交
25
#include <asm/uv/uv.h>
L
Linus Torvalds 已提交
26

A
Andi Kleen 已提交
27 28
/*
 * SRAT parsing state: set to 1 by the CPU affinity callbacks in this file
 * once an entry has been accepted, and to -1 by bad_srat() when the table
 * is rejected.  srat_disabled() treats any negative value as "not used".
 */
int acpi_numa __initdata;

L
Linus Torvalds 已提交
29 30
/*
 * Translate an ACPI proximity domain (PXM) into a NUMA node id by
 * delegating to acpi_map_pxm_to_node().  Callers in this file treat a
 * negative return value as failure.
 */
static __init int setup_node(int pxm)
{
	return acpi_map_pxm_to_node(pxm);
}

/*
 * Give up on the SRAT: flag it as unusable (acpi_numa becomes negative,
 * which srat_disabled() reports) and log that it will not be used.
 */
static __init void bad_srat(void)
{
	acpi_numa = -1;
	printk(KERN_ERR "SRAT: SRAT not used.\n");
}

static __init inline int srat_disabled(void)
{
42
	return acpi_numa < 0;
L
Linus Torvalds 已提交
43 44 45 46 47
}

/*
 * Callback for SLIT parsing.
 *
 * slit->entry[] is read as a row-major locality_count x locality_count
 * matrix; every (i, j) entry is passed to numa_set_distance() for the
 * nodes that proximity domains i and j map to.
 */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	int i, j;

	for (i = 0; i < slit->locality_count; i++) {
		for (j = 0; j < slit->locality_count; j++) {
			numa_set_distance(pxm_to_node(i), pxm_to_node(j),
				slit->entry[slit->locality_count * i + j]);
		}
	}
}

56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
/*
 * Callback for Proximity Domain -> x2APIC mapping.
 *
 * Records the NUMA node of one enabled x2APIC SRAT entry.  Does nothing
 * if the SRAT has already been rejected.  The whole SRAT is rejected
 * (bad_srat()) when the entry is shorter than the expected structure or
 * when no node can be set up for its proximity domain.  An entry whose
 * APIC ID fails apic->apic_id_valid(), or is >= MAX_LOCAL_APIC, is only
 * logged and skipped.
 */
void __init
acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
{
	int pxm, node;
	int apic_id;

	if (srat_disabled())
		return;
	if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
		bad_srat();
		return;
	}
	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
		return;
	pxm = pa->proximity_domain;
	apic_id = pa->apic_id;
	if (!apic->apic_id_valid(apic_id)) {
		printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n",
			 pxm, apic_id);
		return;
	}
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
		bad_srat();
		return;
	}

	if (apic_id >= MAX_LOCAL_APIC) {
		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
		return;
	}
	set_apicid_to_node(apic_id, node);
	node_set(node, numa_nodes_parsed);
	/* At least one CPU affinity entry was accepted. */
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
	       pxm, apic_id, node);
}

L
Linus Torvalds 已提交
96 97
/* Callback for Proximity Domain -> LAPIC mapping */
void __init
98
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
L
Linus Torvalds 已提交
99 100
{
	int pxm, node;
101 102
	int apic_id;

103 104
	if (srat_disabled())
		return;
105
	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
106
		bad_srat();
107 108
		return;
	}
109
	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
L
Linus Torvalds 已提交
110
		return;
111
	pxm = pa->proximity_domain_lo;
112 113
	if (acpi_srat_revision >= 2)
		pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8;
L
Linus Torvalds 已提交
114 115 116 117 118 119
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
		bad_srat();
		return;
	}
120

121
	if (get_uv_system_type() >= UV_X2APIC)
J
Jack Steiner 已提交
122 123 124
		apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
	else
		apic_id = pa->apic_id;
125 126 127 128 129 130

	if (apic_id >= MAX_LOCAL_APIC) {
		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
		return;
	}

131
	set_apicid_to_node(apic_id, node);
132
	node_set(node, numa_nodes_parsed);
L
Linus Torvalds 已提交
133
	acpi_numa = 1;
134
	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
135
	       pxm, apic_id, node);
L
Linus Torvalds 已提交
136 137
}

138
/*
 * Returns 1 when the kernel is built with CONFIG_MEMORY_HOTPLUG and 0
 * otherwise.  acpi_numa_memory_affinity_init() skips hot-pluggable SRAT
 * memory entries when this returns 0.
 */
#ifdef CONFIG_MEMORY_HOTPLUG
static inline int save_add_info(void) {return 1;}
#else
static inline int save_add_info(void) {return 0;}
#endif
143

144
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
145 146
static void __init
handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
147
{
148
	int overlap, i;
149 150 151 152 153 154
	unsigned long start_pfn, end_pfn;

	start_pfn = PFN_DOWN(start);
	end_pfn = PFN_UP(end);

	/*
155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210
	 * For movablemem_map=acpi:
	 *
	 * SRAT:		|_____| |_____| |_________| |_________| ......
	 * node id:                0       1         1           2
	 * hotpluggable:	   n       y         y           n
	 * movablemem_map:	        |_____| |_________|
	 *
	 * Using movablemem_map, we can prevent memblock from allocating memory
	 * on ZONE_MOVABLE at boot time.
	 *
	 * Before parsing SRAT, memblock has already reserve some memory ranges
	 * for other purposes, such as for kernel image. We cannot prevent
	 * kernel from using these memory, so we need to exclude these memory
	 * even if it is hotpluggable.
	 * Furthermore, to ensure the kernel has enough memory to boot, we make
	 * all the memory on the node which the kernel resides in
	 * un-hotpluggable.
	 */
	if (hotpluggable && movablemem_map.acpi) {
		/* Exclude ranges reserved by memblock. */
		struct memblock_type *rgn = &memblock.reserved;

		for (i = 0; i < rgn->cnt; i++) {
			if (end <= rgn->regions[i].base ||
			    start >= rgn->regions[i].base +
			    rgn->regions[i].size)
				continue;

			/*
			 * If the memory range overlaps the memory reserved by
			 * memblock, then the kernel resides in this node.
			 */
			node_set(node, movablemem_map.numa_nodes_kernel);

			goto out;
		}

		/*
		 * If the kernel resides in this node, then the whole node
		 * should not be hotpluggable.
		 */
		if (node_isset(node, movablemem_map.numa_nodes_kernel))
			goto out;

		insert_movablemem_map(start_pfn, end_pfn);

		/*
		 * numa_nodes_hotplug nodemask represents which nodes are put
		 * into movablemem_map.map[].
		 */
		node_set(node, movablemem_map.numa_nodes_hotplug);
		goto out;
	}

	/*
	 * For movablemem_map=nn[KMG]@ss[KMG]:
211 212 213 214 215 216 217 218
	 *
	 * SRAT:		|_____| |_____| |_________| |_________| ......
	 * node id:		   0       1         1           2
	 * user specified:	          |__|                 |___|
	 * movablemem_map:		  |___| |_________|    |______| ......
	 *
	 * Using movablemem_map, we can prevent memblock from allocating memory
	 * on ZONE_MOVABLE at boot time.
219 220
	 *
	 * NOTE: In this case, SRAT info will be ingored.
221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247
	 */
	overlap = movablemem_map_overlap(start_pfn, end_pfn);
	if (overlap >= 0) {
		/*
		 * If part of this range is in movablemem_map, we need to
		 * add the range after it to extend the range to the end
		 * of the node, because from the min address specified to
		 * the end of the node will be ZONE_MOVABLE.
		 */
		start_pfn = max(start_pfn,
			    movablemem_map.map[overlap].start_pfn);
		insert_movablemem_map(start_pfn, end_pfn);

		/*
		 * Set the nodemask, so that if the address range on one node
		 * is not continuse, we can add the subsequent ranges on the
		 * same node into movablemem_map.
		 */
		node_set(node, movablemem_map.numa_nodes_hotplug);
	} else {
		if (node_isset(node, movablemem_map.numa_nodes_hotplug))
			/*
			 * Insert the range if we already have movable ranges
			 * on the same node.
			 */
			insert_movablemem_map(start_pfn, end_pfn);
	}
248 249
out:
	return;
250 251
}
#else		/* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
252 253
static inline void
handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
254 255 256 257
{
}
#endif		/* CONFIG_HAVE_MEMBLOCK_NODE_MAP */

L
Linus Torvalds 已提交
258
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
259
int __init
260
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
L
Linus Torvalds 已提交
261
{
T
Tejun Heo 已提交
262
	u64 start, end;
263
	u32 hotpluggable;
L
Linus Torvalds 已提交
264 265
	int node, pxm;

266
	if (srat_disabled())
267 268 269
		goto out_err;
	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity))
		goto out_err_bad_srat;
270
	if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
271
		goto out_err;
272 273
	hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
	if (hotpluggable && !save_add_info())
274 275
		goto out_err;

276 277
	start = ma->base_address;
	end = start + ma->length;
L
Linus Torvalds 已提交
278
	pxm = ma->proximity_domain;
279 280
	if (acpi_srat_revision <= 1)
		pxm &= 0xff;
281

L
Linus Torvalds 已提交
282 283 284
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
285
		goto out_err_bad_srat;
L
Linus Torvalds 已提交
286
	}
287

288 289
	if (numa_add_memblk(node, start, end) < 0)
		goto out_err_bad_srat;
290

291 292
	node_set(node, numa_nodes_parsed);

293
	printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n",
294
	       node, pxm,
295 296 297
	       (unsigned long long) start, (unsigned long long) end - 1,
	       hotpluggable ? "Hot Pluggable": "");

298
	handle_movablemem(node, start, end, hotpluggable);
299

300
	return 0;
301 302 303 304
out_err_bad_srat:
	bad_srat();
out_err:
	return -1;
L
Linus Torvalds 已提交
305 306 307 308
}

/* Deliberately empty: this implementation performs no work. */
void __init acpi_numa_arch_fixup(void)
{
}

309 310 311 312 313 314 315 316 317
/*
 * Run acpi_numa_init() and report whether the SRAT ended up usable.
 *
 * Returns the negative value from acpi_numa_init() if it failed, -EINVAL
 * if the SRAT was rejected during parsing, and 0 otherwise.
 */
int __init x86_acpi_numa_init(void)
{
	int err = acpi_numa_init();

	if (err < 0)
		return err;
	if (srat_disabled())
		return -EINVAL;
	return 0;
}