/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
#include <linux/mm.h>

#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>
/* SRAT parse state: 0 = not parsed yet, 1 = usable, -1 = rejected (bad_srat). */
int acpi_numa __initdata;

/* SLIT (node distance) table, kept only if it passed slit_valid(). */
static struct acpi_table_slit *acpi_slit;

/* Set of nodes for which a memory affinity entry was parsed. */
static nodemask_t nodes_parsed __initdata;
/* Memory range of each node as read from the SRAT. */
static struct bootnode nodes[MAX_NUMNODES] __initdata;
/* Accepted hot-pluggable memory range per node (filled by reserve_hotadd). */
static struct bootnode nodes_add[MAX_NUMNODES];
/* Nonzero once a hotplug area was accepted; makes cutoff_node() a no-op. */
static int found_add_area __initdata;
/* Percent of RAM allowed for hotadd areas; presumably set from the
   command line — TODO confirm against the option parser. */
int hotadd_percent __initdata = 0;

/* Too small nodes confuse the VM badly. Usually they result
   from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)

/* Map an ACPI proximity domain (PXM) to a NUMA node id; negative on failure. */
static __init int setup_node(int pxm)
{
	return acpi_map_pxm_to_node(pxm);
}

static __init int conflicting_nodes(unsigned long start, unsigned long end)
{
	int i;
46
	for_each_node_mask(i, nodes_parsed) {
47
		struct bootnode *nd = &nodes[i];
L
Linus Torvalds 已提交
48 49 50
		if (nd->start == nd->end)
			continue;
		if (nd->end > start && nd->start < end)
51
			return i;
L
Linus Torvalds 已提交
52
		if (nd->end == end && nd->start == start)
53
			return i;
L
Linus Torvalds 已提交
54 55 56 57 58 59
	}
	return -1;
}

static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
60
	struct bootnode *nd = &nodes[i];
61 62 63 64

	if (found_add_area)
		return;

L
Linus Torvalds 已提交
65 66 67 68 69 70 71 72 73 74 75 76 77 78
	if (nd->start < start) {
		nd->start = start;
		if (nd->end < nd->start)
			nd->start = nd->end;
	}
	if (nd->end > end) {
		nd->end = end;
		if (nd->start > nd->end)
			nd->start = nd->end;
	}
}

/*
 * Give up on the SRAT: mark it rejected and undo everything the parse
 * callbacks may already have recorded.
 */
static __init void bad_srat(void)
{
	int i;
	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
	found_add_area = 0;
	/* Drop all APIC -> node mappings. */
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		apicid_to_node[i] = NUMA_NO_NODE;
	/* NOTE(review): this clears nodes_add[i].start and nodes[i].end only;
	   looks like it may also intend nodes[i].start / nodes_add[i].end —
	   confirm against mainline before relying on a full reset. */
	for (i = 0; i < MAX_NUMNODES; i++)
		nodes_add[i].start = nodes[i].end = 0;
	remove_all_active_ranges();
}

/* Nonzero when NUMA is disabled on the command line or the SRAT was rejected. */
static __init inline int srat_disabled(void)
{
	return numa_off || acpi_numa < 0;
}

/*
 * A lot of BIOS fill in 10 (= no distance) everywhere. This messes
 * up the NUMA heuristics which wants the local node to have a smaller
 * distance than the others.
 * Do some quick checks here and only use the SLIT if it passes:
 * every diagonal entry must be exactly 10, every off-diagonal one
 * strictly greater than 10.
 */
static __init int slit_valid(struct acpi_table_slit *slit)
{
	int lc = slit->localities;
	int i, j;

	for (i = 0; i < lc; i++) {
		for (j = 0; j < lc; j++) {
			u8 dist = slit->entry[lc * i + j];

			if (i == j ? dist != 10 : dist <= 10)
				return 0;
		}
	}
	return 1;
}

/* Callback for SLIT parsing: keep the table only if it looks sane. */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	if (slit_valid(slit))
		acpi_slit = slit;
	else
		printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
{
	int pxm, node;
	if (srat_disabled())
		return;
	/* A wrong-sized entry means the whole table is untrustworthy. */
	if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {
		bad_srat();
		return;
	}
	/* Skip entries the firmware marked unused. */
	if (pa->flags.enabled == 0)
		return;
	pxm = pa->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
		bad_srat();
		return;
	}
	/* Record the CPU -> node mapping and mark the SRAT usable. */
	apicid_to_node[pa->apic_id] = node;
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
	       pxm, pa->apic_id, node);
}

#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
/*
 * Protect against too large hotadd areas that would fill up memory.
 * Tracks, across calls, how much mem_map space has been handed out
 * (allocated) and where the last reservation probe ended
 * (last_area_end).  May shrink nd->end to fit the budget.
 * Returns 1 if the (possibly trimmed) area is acceptable, 0 otherwise.
 */
static int hotadd_enough_memory(struct bootnode *nd)
{
	static unsigned long allocated;
	static unsigned long last_area_end;
	unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
	long mem = pages * sizeof(struct page);
	unsigned long addr;
	unsigned long allowed;
	unsigned long oldpages = pages;

	/* mem < 0 means the struct-page size computation overflowed. */
	if (mem < 0)
		return 0;
	/* Budget = hotadd_percent % of the present (non-hole) RAM. */
	allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
	allowed = (allowed / 100) * hotadd_percent;
	if (allocated + mem > allowed) {
		unsigned long range;
		/* Give them at least part of their hotadd memory upto hotadd_percent
		   It would be better to spread the limit out
		   over multiple hotplug areas, but that is too complicated
		   right now */
		if (allocated >= allowed)
			return 0;
		range = allowed - allocated;
		pages = (range / PAGE_SIZE);
		mem = pages * sizeof(struct page);
		nd->end = nd->start + range;
	}
	/* Not completely fool proof, but a good sanity check */
	addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
	if (addr == -1UL)
		return 0;
	if (pages != oldpages)
		printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
			pages << PAGE_SHIFT);
	last_area_end = addr + mem;
	allocated += mem;
	return 1;
}

/* Extend end_pfn to cover a hot-add area and remember that one exists. */
static int update_end_of_memory(unsigned long end)
{
	found_add_area = 1;
	if ((end >> PAGE_SHIFT) > end_pfn)
		end_pfn = end >> PAGE_SHIFT;
	return 1;
}

/* Hot-pluggable SRAT entries are only kept when a hotadd budget was given. */
static inline int save_add_info(void)
{
	return hotadd_percent > 0;
}
#else
/* Stubs for !CONFIG_MEMORY_HOTPLUG_RESERVE builds. */
int update_end_of_memory(unsigned long end) {return 0;}
static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static inline int save_add_info(void) {return 1;}
#else
static inline int save_add_info(void) {return 0;}
#endif
#endif
/*
 * Update nodes_add and decide whether to include the add area in the zone.
 * Both SPARSE and RESERVE need nodes_add information.
 * This code supports one contiguous hot add area per node.
 * Returns the value of update_end_of_memory() on success, -1 on rejection.
 */
static int reserve_hotadd(int node, unsigned long start, unsigned long end)
{
	unsigned long s_pfn = start >> PAGE_SHIFT;
	unsigned long e_pfn = end >> PAGE_SHIFT;
	int ret = 0, changed = 0;
	struct bootnode *nd = &nodes_add[node];

	/* I had some trouble with strange memory hotadd regions breaking
	   the boot. Be very strict here and reject anything unexpected.
	   If you want working memory hotadd write correct SRATs.

	   The node size check is a basic sanity check to guard against
	   mistakes */
	if ((signed long)(end - start) < NODE_MIN_SIZE) {
		printk(KERN_ERR "SRAT: Hotplug area too small\n");
		return -1;
	}

	/* This check might be a bit too strict, but I'm keeping it for now. */
	if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
		printk(KERN_ERR
			"SRAT: Hotplug area %lu -> %lu has existing memory\n",
			s_pfn, e_pfn);
		return -1;
	}

	if (!hotadd_enough_memory(&nodes_add[node]))  {
		printk(KERN_ERR "SRAT: Hotplug area too large\n");
		return -1;
	}

	/* Looks good */

	/* Either start a new hotadd range or extend the existing one;
	   non-adjacent pieces are only warned about, not merged. */
	if (nd->start == nd->end) {
 		nd->start = start;
 		nd->end = end;
		changed = 1;
 	} else {
 		if (nd->start == end) {
 			nd->start = start;
			changed = 1;
		}
 		if (nd->end == start) {
 			nd->end = end;
			changed = 1;
		}
		if (!changed)
			printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
 	}

	ret = update_end_of_memory(nd->end);

	if (changed)
	 	printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
	return ret;
}

/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
{
	struct bootnode *nd, oldnode;
	unsigned long start, end;
	int node, pxm;
	int i;

	if (srat_disabled())
		return;
	/* A wrong-sized entry means the whole table is untrustworthy. */
	if (ma->header.length != sizeof(struct acpi_table_memory_affinity)) {
		bad_srat();
		return;
	}
	if (ma->flags.enabled == 0)
		return;
	/* Drop hotplug entries when no hotadd budget was configured. */
	if (ma->flags.hot_pluggable && !save_add_info())
		return;
	start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
	end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
	pxm = ma->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
		bad_srat();
		return;
	}
	i = conflicting_nodes(start, end);
	if (i == node) {
		printk(KERN_WARNING
		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
			pxm, start, end, nodes[i].start, nodes[i].end);
	} else if (i >= 0) {
		printk(KERN_ERR
		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
		       pxm, start, end, node_to_pxm(i),
			nodes[i].start, nodes[i].end);
		bad_srat();
		return;
	}
	/* Remember the old range so it can be restored if hotadd fails. */
	nd = &nodes[node];
	oldnode = *nd;
	if (!node_test_and_set(node, nodes_parsed)) {
		nd->start = start;
		nd->end = end;
	} else {
		/* Grow the existing node range to cover the new entry. */
		if (start < nd->start)
			nd->start = start;
		if (nd->end < end)
			nd->end = end;
	}

	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
	       nd->start, nd->end);
	e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
						nd->end >> PAGE_SHIFT);
	push_node_boundaries(node, nd->start >> PAGE_SHIFT,
						nd->end >> PAGE_SHIFT);

	/* Fix: was "!reserve_hotadd(...) < 0", which is always false
	   (!x is 0 or 1), so a failed reservation was never undone. */
	if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
		/* Ignore hotadd region. Undo damage */
		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
		*nd = oldnode;
		if ((nd->start | nd->end) == 0)
			node_clear(node, nodes_parsed);
	}
}

/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int nodes_cover_memory(void)
{
	int i;
	unsigned long pxmram, e820ram;

	pxmram = 0;
	for_each_node_mask(i, nodes_parsed) {
		unsigned long s = nodes[i].start >> PAGE_SHIFT;
		unsigned long e = nodes[i].end >> PAGE_SHIFT;
		pxmram += e - s;
361
		pxmram -= absent_pages_in_range(s, e);
362 363
		if ((long)pxmram < 0)
			pxmram = 0;
364 365
	}

366
	e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
367 368
	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
369 370 371 372 373 374 375 376 377
		printk(KERN_ERR
	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
			(pxmram << PAGE_SHIFT) >> 20,
			(e820ram << PAGE_SHIFT) >> 20);
		return 0;
	}
	return 1;
}

/* Forget a node: clear it from nodes_parsed and drop every APIC id
   mapping that pointed at it. */
static void unparse_node(int node)
{
	int apic;

	node_clear(node, nodes_parsed);
	for (apic = 0; apic < MAX_LOCAL_APIC; apic++)
		if (apicid_to_node[apic] == node)
			apicid_to_node[apic] = NUMA_NO_NODE;
}

void __init acpi_numa_arch_fixup(void) {}

/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
	int i;

	/* First clean up the node list */
	for (i = 0; i < MAX_NUMNODES; i++) {
 		cutoff_node(i, start, end);
		/* Throw away nodes that ended up too small to be useful. */
		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
			unparse_node(i);
			node_set_offline(i);
		}
	}

	/* Bail out unless the SRAT was parsed and accepted. */
	if (acpi_numa <= 0)
		return -1;

	if (!nodes_cover_memory()) {
		bad_srat();
		return -1;
	}

	memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
	if (memnode_shift < 0) {
		printk(KERN_ERR
		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	/* Finally register nodes */
	for_each_node_mask(i, nodes_parsed)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	/* Try again in case setup_node_bootmem missed one due
	   to missing bootmem */
	for_each_node_mask(i, nodes_parsed)
		if (!node_online(i))
			setup_node_bootmem(i, nodes[i].start, nodes[i].end);

	/* Detach CPUs from nodes that were dropped above. */
	for (i = 0; i < NR_CPUS; i++) { 
		if (cpu_to_node[i] == NUMA_NO_NODE)
			continue;
		if (!node_isset(cpu_to_node[i], nodes_parsed))
			numa_set_node(i, NUMA_NO_NODE);
	}
	numa_init_array();
	return 0;
}

/*
 * Reserve the bootmem for a node's hot-add area so nothing else claims
 * it before memory is actually plugged in.  The "cost" printed is the
 * mem_map (struct page array) size for the reserved range, in MB.
 */
void __init srat_reserve_add_area(int nodeid)
{
	if (found_add_area && nodes_add[nodeid].end) {
		u64 total_mb;

		printk(KERN_INFO "SRAT: Reserving hot-add memory space "
				"for node %d at %Lx-%Lx\n",
			nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
		/* pages in the area -> bytes of struct page -> MB */
		total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
					>> PAGE_SHIFT;
		total_mb *= sizeof(struct page);
		total_mb >>= 20;
		printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
				"pre-allocated memory.\n", (unsigned long long)total_mb);
		reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
			       nodes_add[nodeid].end - nodes_add[nodeid].start);
	}
}

/*
 * SLIT distance between two nodes.  Without a valid SLIT, fall back to
 * the conventional 10 (local) / 20 (remote) defaults.
 */
int __node_distance(int a, int b)
{
	if (!acpi_slit)
		return (a == b) ? 10 : 20;
	return acpi_slit->entry[acpi_slit->localities * node_to_pxm(a) +
				node_to_pxm(b)];
}

EXPORT_SYMBOL(__node_distance);

/*
 * Find the node whose hot-add range contains the given physical address.
 * Falls back to node 0 when no range matches; like the original, the
 * last matching node wins if ranges were to overlap.
 */
int memory_add_physaddr_to_nid(u64 start)
{
	int node, nid = 0;

	for_each_node(node) {
		struct bootnode *na = &nodes_add[node];

		if (na->start <= start && start < na->end)
			nid = node;
	}

	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);