/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>

int acpi_numa __initdata;

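/*
 * With ACPI memory hotplug configured but no generic memory hotplug
 * support, hot-pluggable SRAT ranges are instead reserved at boot time
 * (see reserve_hotadd() and srat_reserve_add_area() below).
 */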
#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
	defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
		&& !defined(CONFIG_MEMORY_HOTPLUG)
#define RESERVE_HOTADD 1
#endif

static struct acpi_table_slit *acpi_slit;

static nodemask_t nodes_parsed __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
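/* Hot-pluggable range found for each node; found_add_area is set once
   the first such range has been accepted by reserve_hotadd(). */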
static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
static int found_add_area __initdata;
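/* Percentage of present RAM that hot-add bookkeeping may consume.
   0 means hot-pluggable SRAT ranges are ignored entirely. */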
int hotadd_percent __initdata = 0;
#ifndef RESERVE_HOTADD
#define hotadd_percent 0	/* Ignore all settings */
#endif

/* Nodes smaller than this confuse the VM badly; such nodes usually
   result from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)

static __init int setup_node(int pxm)
{
	return acpi_map_pxm_to_node(pxm);
}

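/* Return the index of an already parsed node that overlaps [start, end),
   or -1 if there is no conflict. */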
static __init int conflicting_nodes(unsigned long start, unsigned long end)
{
	int i;
	for_each_node_mask(i, nodes_parsed) {
		struct bootnode *nd = &nodes[i];
		if (nd->start == nd->end)
			continue;
		if (nd->end > start && nd->start < end)
			return i;
		if (nd->end == end && nd->start == start)
			return i;
	}
	return -1;
}

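/* Clamp node i to [start, end).  Skipped once a hot-add area has been
   found, presumably so that not-yet-present hot-add memory does not get
   trimmed away. */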
static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
	struct bootnode *nd = &nodes[i];

	if (found_add_area)
		return;

	if (nd->start < start) {
		nd->start = start;
		if (nd->end < nd->start)
			nd->start = nd->end;
	}
	if (nd->end > end) {
		nd->end = end;
		if (nd->start > nd->end)
			nd->start = nd->end;
	}
}

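/* The SRAT is unusable: reset everything parsed so far and mark ACPI
   NUMA information as invalid. */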
static __init void bad_srat(void)
{
	int i;
	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
	found_add_area = 0;
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		apicid_to_node[i] = NUMA_NO_NODE;
	for (i = 0; i < MAX_NUMNODES; i++)
		nodes_add[i].start = nodes[i].end = 0;
	remove_all_active_ranges();
}

static __init inline int srat_disabled(void)
{
	return numa_off || acpi_numa < 0;
}

/*
 * A lot of BIOSes fill in 10 (= no distance) everywhere. This messes
 * up the NUMA heuristics, which want the local node to have a smaller
 * distance than the others.
 * Do some quick checks here and only use the SLIT if it passes them.
 */
static __init int slit_valid(struct acpi_table_slit *slit)
{
	int i, j;
	int d = slit->localities;
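	/* The SLIT is a d x d matrix; entry[d*i + j] is the distance from
	   locality i to locality j.  The diagonal must be the local
	   distance (10) and every other entry must be larger. */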
	for (i = 0; i < d; i++) {
		for (j = 0; j < d; j++)  {
			u8 val = slit->entry[d*i + j];
			if (i == j) {
				if (val != 10)
					return 0;
			} else if (val <= 10)
				return 0;
		}
	}
	return 1;
}

/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	if (!slit_valid(slit)) {
		printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
		return;
	}
	acpi_slit = slit;
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
{
	int pxm, node;
	if (srat_disabled())
		return;
	if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {
		bad_srat();
		return;
	}
	if (pa->flags.enabled == 0)
		return;
	pxm = pa->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
		bad_srat();
		return;
	}
	apicid_to_node[pa->apic_id] = node;
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
	       pxm, pa->apic_id, node);
}

#ifdef RESERVE_HOTADD
/*
 * Protect against too large hotadd areas that would fill up memory.
 */
static int hotadd_enough_memory(struct bootnode *nd)
{
	static unsigned long allocated;
	static unsigned long last_area_end;
	unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
	long mem = pages * sizeof(struct page);
	unsigned long addr;
	unsigned long allowed;
	unsigned long oldpages = pages;

	if (mem < 0)
		return 0;
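	/* mem is the size of the struct page array the new range will need;
	   budget it against hotadd_percent of the present RAM. */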
	allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
	allowed = (allowed / 100) * hotadd_percent;
	if (allocated + mem > allowed) {
		unsigned long range;
		/* Give them at least part of their hotadd memory, up to
		   hotadd_percent.  It would be better to spread the limit
		   out over multiple hotplug areas, but that is too
		   complicated right now. */
		if (allocated >= allowed)
			return 0;
		range = allowed - allocated;
		pages = (range / PAGE_SIZE);
		mem = pages * sizeof(struct page);
		nd->end = nd->start + range;
	}
	/* Not completely foolproof, but a good sanity check */
	addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
	if (addr == -1UL)
		return 0;
	if (pages != oldpages)
		printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
			pages << PAGE_SHIFT);
	last_area_end = addr + mem;
	allocated += mem;
	return 1;
}

/*
 * It is fine to add this area to the node's data now; it will be used
 * later.  This code supports one contiguous hot-add area per node.
 */
static int reserve_hotadd(int node, unsigned long start, unsigned long end)
{
	unsigned long s_pfn = start >> PAGE_SHIFT;
	unsigned long e_pfn = end >> PAGE_SHIFT;
	int changed = 0;
	struct bootnode *nd = &nodes_add[node];

	/* I had some trouble with strange memory hotadd regions breaking
	   the boot.  Be very strict here and reject anything unexpected.
	   If you want working memory hotadd, write correct SRATs.

	   The node size check is a basic sanity check to guard against
	   mistakes. */
	if ((signed long)(end - start) < NODE_MIN_SIZE) {
		printk(KERN_ERR "SRAT: Hotplug area too small\n");
		return -1;
	}

	/* This check might be a bit too strict, but I'm keeping it for now. */
	if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
		printk(KERN_ERR
			"SRAT: Hotplug area %lu -> %lu has existing memory\n",
			s_pfn, e_pfn);
		return -1;
	}

	if (!hotadd_enough_memory(&nodes_add[node]))  {
		printk(KERN_ERR "SRAT: Hotplug area too large\n");
		return -1;
	}

	/* Looks good */

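	/* Merge [start, end) into this node's single hot-add range: start a
	   new range or extend an adjacent one.  A piece that is not adjacent
	   to the existing range is not recorded (the warning below). */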
 	found_add_area = 1;
	if (nd->start == nd->end) {
 		nd->start = start;
 		nd->end = end;
		changed = 1;
 	} else {
 		if (nd->start == end) {
 			nd->start = start;
			changed = 1;
		}
 		if (nd->end == start) {
 			nd->end = end;
			changed = 1;
		}
		if (!changed)
			printk(KERN_ERR "SRAT: Hotplug zone not contiguous. Partly ignored\n");
 	}

 	if ((nd->end >> PAGE_SHIFT) > end_pfn)
 		end_pfn = nd->end >> PAGE_SHIFT;

	if (changed)
	 	printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
	return 0;
}
#endif

/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
{
	struct bootnode *nd, oldnode;
	unsigned long start, end;
	int node, pxm;
	int i;

	if (srat_disabled())
		return;
	if (ma->header.length != sizeof(struct acpi_table_memory_affinity)) {
		bad_srat();
		return;
	}
	if (ma->flags.enabled == 0)
		return;
 	if (ma->flags.hot_pluggable && hotadd_percent == 0)
		return;
	start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
	end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
	pxm = ma->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
		bad_srat();
		return;
	}
	i = conflicting_nodes(start, end);
	if (i == node) {
		printk(KERN_WARNING
		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
			pxm, start, end, nodes[i].start, nodes[i].end);
	} else if (i >= 0) {
		printk(KERN_ERR
		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
		       pxm, start, end, node_to_pxm(i),
			nodes[i].start, nodes[i].end);
		bad_srat();
		return;
	}
	nd = &nodes[node];
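	/* Remember the previous extent so a rejected hotplug range can be
	   rolled back further down. */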
	oldnode = *nd;
	if (!node_test_and_set(node, nodes_parsed)) {
		nd->start = start;
		nd->end = end;
	} else {
		if (start < nd->start)
			nd->start = start;
		if (nd->end < end)
			nd->end = end;
	}

	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
	       nd->start, nd->end);
	e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
						nd->end >> PAGE_SHIFT);

#ifdef RESERVE_HOTADD
 	if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
		/* Ignore hotadd region. Undo damage */
		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
		*nd = oldnode;
		if ((nd->start | nd->end) == 0)
			node_clear(node, nodes_parsed);
	}
#endif
}

/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int nodes_cover_memory(void)
{
	int i;
	unsigned long pxmram, e820ram;

	pxmram = 0;
	for_each_node_mask(i, nodes_parsed) {
		unsigned long s = nodes[i].start >> PAGE_SHIFT;
		unsigned long e = nodes[i].end >> PAGE_SHIFT;
		pxmram += e - s;
		pxmram -= absent_pages_in_range(s, e);
		/* nodes_add[] holds byte addresses, pxmram counts pages */
		pxmram -= (nodes_add[i].end - nodes_add[i].start) >> PAGE_SHIFT;
		if ((long)pxmram < 0)
			pxmram = 0;
	}

	e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
		printk(KERN_ERR
	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
			(pxmram << PAGE_SHIFT) >> 20,
			(e820ram << PAGE_SHIFT) >> 20);
		return 0;
	}
	return 1;
}

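/* Forget everything recorded for a node: drop it from nodes_parsed and
   clear any APIC ids that mapped to it. */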
static void unparse_node(int node)
{
	int i;
	node_clear(node, nodes_parsed);
	for (i = 0; i < MAX_LOCAL_APIC; i++) {
		if (apicid_to_node[i] == node)
			apicid_to_node[i] = NUMA_NO_NODE;
	}
}

void __init acpi_numa_arch_fixup(void) {}

/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
	int i;

	/* First clean up the node list */
	for (i = 0; i < MAX_NUMNODES; i++) {
 		cutoff_node(i, start, end);
		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
			unparse_node(i);
			node_set_offline(i);
		}
	}

	if (acpi_numa <= 0)
		return -1;

	if (!nodes_cover_memory()) {
		bad_srat();
		return -1;
	}

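	/* Build the physical-address-to-node hash used by the NUMA core;
	   without a usable shift NUMA cannot be enabled. */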
	memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
	if (memnode_shift < 0) {
		printk(KERN_ERR
		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	/* Finally register nodes */
	for_each_node_mask(i, nodes_parsed)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	/* Try again in case setup_node_bootmem missed one due
	   to missing bootmem */
	for_each_node_mask(i, nodes_parsed)
		if (!node_online(i))
			setup_node_bootmem(i, nodes[i].start, nodes[i].end);

	for (i = 0; i < NR_CPUS; i++) { 
		if (cpu_to_node[i] == NUMA_NO_NODE)
			continue;
		if (!node_isset(cpu_to_node[i], nodes_parsed))
			numa_set_node(i, NUMA_NO_NODE);
	}
	numa_init_array();
	return 0;
}

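/* Reserve a node's hot-add window in its bootmem allocator so that
   early allocations stay out of it; the printk below reports the
   struct page overhead of covering the extra range. */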
void __init srat_reserve_add_area(int nodeid)
{
	if (found_add_area && nodes_add[nodeid].end) {
		u64 total_mb;

		printk(KERN_INFO "SRAT: Reserving hot-add memory space "
				"for node %d at %Lx-%Lx\n",
			nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
		total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
					>> PAGE_SHIFT;
		total_mb *= sizeof(struct page);
		total_mb >>= 20;
		printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
				"pre-allocated memory.\n", (unsigned long long)total_mb);
		reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
			       nodes_add[nodeid].end - nodes_add[nodeid].start);
	}
}

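/* Node distance in ACPI SLIT units: 10 means local.  Without a SLIT,
   fall back to 10 for the local node and 20 for everything else. */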
int __node_distance(int a, int b)
{
	int index;

	if (!acpi_slit)
		return a == b ? 10 : 20;
	index = acpi_slit->localities * node_to_pxm(a);
	return acpi_slit->entry[index + node_to_pxm(b)];
}

EXPORT_SYMBOL(__node_distance);