srat.c 12.1 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
18 19
#include <linux/bootmem.h>
#include <linux/mm.h>
L
Linus Torvalds 已提交
20 21
#include <asm/proto.h>
#include <asm/numa.h>
22
#include <asm/e820.h>
L
Linus Torvalds 已提交
23

A
Andi Kleen 已提交
24 25
int acpi_numa __initdata;

L
Linus Torvalds 已提交
26 27 28
static struct acpi_table_slit *acpi_slit;

static nodemask_t nodes_parsed __initdata;
29
static struct bootnode nodes[MAX_NUMNODES] __initdata;
30
static struct bootnode nodes_add[MAX_NUMNODES];
31
static int found_add_area __initdata;
32
int hotadd_percent __initdata = 0;
L
Linus Torvalds 已提交
33

34 35 36 37
/*
 * Too small nodes confuse the VM badly. Usually they result
 * from BIOS bugs.
 *
 * Size in bytes: acpi_scan_nodes() drops nodes smaller than this and
 * reserve_hotadd() rejects hot-add areas smaller than this.
 */
#define NODE_MIN_SIZE (4*1024*1024)

L
Linus Torvalds 已提交
38 39
/*
 * Translate an ACPI proximity domain into a node id.
 *
 * Thin wrapper around acpi_map_pxm_to_node(); its return value is
 * passed through unchanged (callers treat a negative value as failure).
 */
static __init int setup_node(int pxm)
{
	int node = acpi_map_pxm_to_node(pxm);

	return node;
}

static __init int conflicting_nodes(unsigned long start, unsigned long end)
{
	int i;
46
	for_each_node_mask(i, nodes_parsed) {
47
		struct bootnode *nd = &nodes[i];
L
Linus Torvalds 已提交
48 49 50
		if (nd->start == nd->end)
			continue;
		if (nd->end > start && nd->start < end)
51
			return i;
L
Linus Torvalds 已提交
52
		if (nd->end == end && nd->start == start)
53
			return i;
L
Linus Torvalds 已提交
54 55 56 57 58 59
	}
	return -1;
}

static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
60
	struct bootnode *nd = &nodes[i];
61 62 63 64

	if (found_add_area)
		return;

L
Linus Torvalds 已提交
65 66 67 68 69 70 71 72 73 74 75 76 77 78
	if (nd->start < start) {
		nd->start = start;
		if (nd->end < nd->start)
			nd->start = nd->end;
	}
	if (nd->end > end) {
		nd->end = end;
		if (nd->start > nd->end)
			nd->start = nd->end;
	}
}

static __init void bad_srat(void)
{
79
	int i;
L
Linus Torvalds 已提交
80 81
	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
82
	found_add_area = 0;
83 84
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		apicid_to_node[i] = NUMA_NO_NODE;
85 86
	for (i = 0; i < MAX_NUMNODES; i++)
		nodes_add[i].start = nodes[i].end = 0;
87
	remove_all_active_ranges();
L
Linus Torvalds 已提交
88 89 90 91 92 93 94
}

/*
 * Returns 1 when the SRAT must not be used: NUMA is switched off
 * (numa_off) or the table has already been rejected (acpi_numa < 0).
 */
static __init inline int srat_disabled(void)
{
	if (numa_off)
		return 1;
	return acpi_numa < 0;
}

A
Andi Kleen 已提交
95 96 97 98 99 100 101 102 103
/*
 * A lot of BIOSes fill in 10 (= no distance) everywhere. This messes
 * up the NUMA heuristics, which want the local node to have a smaller
 * distance than the others.
 *
 * Quick plausibility check: every diagonal entry must be exactly 10
 * and every other entry must be larger than 10.
 *
 * Returns 1 if the SLIT passes, 0 otherwise.
 */
static __init int slit_valid(struct acpi_table_slit *slit)
{
	int row, col;
	int n = slit->locality_count;

	for (row = 0; row < n; row++) {
		for (col = 0; col < n; col++) {
			u8 dist = slit->entry[n * row + col];

			if (row == col && dist != 10)
				return 0;
			if (row != col && dist <= 10)
				return 0;
		}
	}
	return 1;
}

L
Linus Torvalds 已提交
118 119 120
/*
 * Callback for SLIT parsing.
 *
 * Remembers the table in acpi_slit if it passes slit_valid();
 * otherwise logs a message and leaves acpi_slit untouched.
 */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	if (slit_valid(slit)) {
		acpi_slit = slit;
		return;
	}
	printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
130
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
L
Linus Torvalds 已提交
131 132
{
	int pxm, node;
133 134
	if (srat_disabled())
		return;
135
	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
136
		bad_srat();
137 138
		return;
	}
139
	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
L
Linus Torvalds 已提交
140
		return;
141
	pxm = pa->proximity_domain_lo;
L
Linus Torvalds 已提交
142 143 144 145 146 147
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
		bad_srat();
		return;
	}
148
	apicid_to_node[pa->apic_id] = node;
L
Linus Torvalds 已提交
149
	acpi_numa = 1;
150 151
	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
	       pxm, pa->apic_id, node);
L
Linus Torvalds 已提交
152 153
}

154
#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
155 156 157 158 159 160 161 162 163 164 165 166 167 168 169
/*
 * Protect against too large hotadd areas that would fill up memory.
 */
static int hotadd_enough_memory(struct bootnode *nd)
{
	static unsigned long allocated;
	static unsigned long last_area_end;
	unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
	long mem = pages * sizeof(struct page);
	unsigned long addr;
	unsigned long allowed;
	unsigned long oldpages = pages;

	if (mem < 0)
		return 0;
170
	allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
171 172
	allowed = (allowed / 100) * hotadd_percent;
	if (allocated + mem > allowed) {
173
		unsigned long range;
174 175 176 177 178 179
		/* Give them at least part of their hotadd memory upto hotadd_percent
		   It would be better to spread the limit out
		   over multiple hotplug areas, but that is too complicated
		   right now */
		if (allocated >= allowed)
			return 0;
180 181
		range = allowed - allocated;
		pages = (range / PAGE_SIZE);
182
		mem = pages * sizeof(struct page);
183
		nd->end = nd->start + range;
184 185 186 187 188 189 190 191 192 193 194 195 196
	}
	/* Not completely fool proof, but a good sanity check */
	addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
	if (addr == -1UL)
		return 0;
	if (pages != oldpages)
		printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
			pages << PAGE_SHIFT);
	last_area_end = addr + mem;
	allocated += mem;
	return 1;
}

197 198 199 200 201 202 203 204 205 206 207 208 209
/*
 * Note that a hot-add area exists and raise end_pfn if the area
 * ends beyond it. @end is a byte address. Always returns 1.
 */
static int update_end_of_memory(unsigned long end)
{
	unsigned long last_pfn = end >> PAGE_SHIFT;

	found_add_area = 1;
	if (last_pfn > end_pfn)
		end_pfn = last_pfn;
	return 1;
}

/* Hot-add information is only kept when hotadd_percent is positive. */
static inline int save_add_info(void)
{
	if (hotadd_percent <= 0)
		return 0;
	return 1;
}
#else
210
/*
 * Stubs used when CONFIG_MEMORY_HOTPLUG_RESERVE is not set.
 *
 * update_end_of_memory() reports failure (-1) and
 * hotadd_enough_memory() accepts every area. Both are static, like
 * their CONFIG_MEMORY_HOTPLUG_RESERVE counterparts, so the symbol's
 * linkage does not depend on the configuration.
 */
static int update_end_of_memory(unsigned long end) {return -1;}
static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
/* Hot-add information is only recorded for CONFIG_MEMORY_HOTPLUG_SPARSE. */
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static inline int save_add_info(void) {return 1;}
#else
static inline int save_add_info(void) {return 0;}
#endif
#endif
218
/*
219 220
 * Update nodes_add and decide if to include add are in the zone.
 * Both SPARSE and RESERVE need nodes_add infomation.
221 222 223 224 225 226
 * This code supports one contigious hot add area per node.
 */
static int reserve_hotadd(int node, unsigned long start, unsigned long end)
{
	unsigned long s_pfn = start >> PAGE_SHIFT;
	unsigned long e_pfn = end >> PAGE_SHIFT;
227
	int ret = 0, changed = 0;
228 229 230 231 232 233 234 235 236 237 238 239 240 241
	struct bootnode *nd = &nodes_add[node];

	/* I had some trouble with strange memory hotadd regions breaking
	   the boot. Be very strict here and reject anything unexpected.
	   If you want working memory hotadd write correct SRATs.

	   The node size check is a basic sanity check to guard against
	   mistakes */
	if ((signed long)(end - start) < NODE_MIN_SIZE) {
		printk(KERN_ERR "SRAT: Hotplug area too small\n");
		return -1;
	}

	/* This check might be a bit too strict, but I'm keeping it for now. */
242
	if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
243 244 245
		printk(KERN_ERR
			"SRAT: Hotplug area %lu -> %lu has existing memory\n",
			s_pfn, e_pfn);
246 247 248 249 250 251 252 253 254 255 256
		return -1;
	}

	if (!hotadd_enough_memory(&nodes_add[node]))  {
		printk(KERN_ERR "SRAT: Hotplug area too large\n");
		return -1;
	}

	/* Looks good */

	if (nd->start == nd->end) {
257 258
		nd->start = start;
		nd->end = end;
259
		changed = 1;
260 261 262
	} else {
		if (nd->start == end) {
			nd->start = start;
263 264
			changed = 1;
		}
265 266
		if (nd->end == start) {
			nd->end = end;
267 268 269 270
			changed = 1;
		}
		if (!changed)
			printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
271
	}
272

273
	ret = update_end_of_memory(nd->end);
274 275 276

	if (changed)
	 	printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
277
	return ret;
278 279
}

L
Linus Torvalds 已提交
280 281
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
282
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
L
Linus Torvalds 已提交
283
{
284
	struct bootnode *nd, oldnode;
L
Linus Torvalds 已提交
285 286 287 288
	unsigned long start, end;
	int node, pxm;
	int i;

289
	if (srat_disabled())
L
Linus Torvalds 已提交
290
		return;
291
	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
292 293 294
		bad_srat();
		return;
	}
295
	if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
296
		return;
297 298

	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
299
		return;
300 301
	start = ma->base_address;
	end = start + ma->length;
L
Linus Torvalds 已提交
302 303 304 305 306 307 308 309
	pxm = ma->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
		bad_srat();
		return;
	}
	i = conflicting_nodes(start, end);
310 311 312 313 314
	if (i == node) {
		printk(KERN_WARNING
		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
			pxm, start, end, nodes[i].start, nodes[i].end);
	} else if (i >= 0) {
L
Linus Torvalds 已提交
315
		printk(KERN_ERR
316 317 318
		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
		       pxm, start, end, node_to_pxm(i),
			nodes[i].start, nodes[i].end);
L
Linus Torvalds 已提交
319 320 321 322
		bad_srat();
		return;
	}
	nd = &nodes[node];
323
	oldnode = *nd;
L
Linus Torvalds 已提交
324 325 326 327 328 329 330 331 332
	if (!node_test_and_set(node, nodes_parsed)) {
		nd->start = start;
		nd->end = end;
	} else {
		if (start < nd->start)
			nd->start = start;
		if (nd->end < end)
			nd->end = end;
	}
333

L
Linus Torvalds 已提交
334 335
	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
	       nd->start, nd->end);
336 337
	e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
						nd->end >> PAGE_SHIFT);
338 339
	push_node_boundaries(node, nd->start >> PAGE_SHIFT,
						nd->end >> PAGE_SHIFT);
340

341 342
	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
	    (reserve_hotadd(node, start, end) < 0)) {
343 344 345 346 347 348
		/* Ignore hotadd region. Undo damage */
		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
		*nd = oldnode;
		if ((nd->start | nd->end) == 0)
			node_clear(node, nodes_parsed);
	}
L
Linus Torvalds 已提交
349 350
}

351 352 353 354 355 356 357 358 359 360 361 362
/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int nodes_cover_memory(void)
{
	int i;
	unsigned long pxmram, e820ram;

	pxmram = 0;
	for_each_node_mask(i, nodes_parsed) {
		unsigned long s = nodes[i].start >> PAGE_SHIFT;
		unsigned long e = nodes[i].end >> PAGE_SHIFT;
		pxmram += e - s;
363
		pxmram -= absent_pages_in_range(s, e);
364 365
		if ((long)pxmram < 0)
			pxmram = 0;
366 367
	}

368
	e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
369 370
	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
371 372 373 374 375 376 377 378 379
		printk(KERN_ERR
	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
			(pxmram << PAGE_SHIFT) >> 20,
			(e820ram << PAGE_SHIFT) >> 20);
		return 0;
	}
	return 1;
}

380 381 382 383 384 385 386 387 388 389
static void unparse_node(int node)
{
	int i;
	node_clear(node, nodes_parsed);
	for (i = 0; i < MAX_LOCAL_APIC; i++) {
		if (apicid_to_node[i] == node)
			apicid_to_node[i] = NUMA_NO_NODE;
	}
}

L
Linus Torvalds 已提交
390 391 392 393 394 395
/* Intentionally empty: there is nothing to fix up here. */
void __init acpi_numa_arch_fixup(void)
{
}

/*
 * Use the information discovered above to actually set up the nodes.
 *
 * Clamps every node to start..end, drops nodes that end up smaller
 * than NODE_MIN_SIZE, validates the result and then registers the
 * remaining nodes with bootmem.
 *
 * Returns 0 on success, -1 if the SRAT was not usable.
 */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
	int node, cpu;

	/* First clean up the node list */
	for (node = 0; node < MAX_NUMNODES; node++) {
		cutoff_node(node, start, end);
		if ((nodes[node].end - nodes[node].start) >= NODE_MIN_SIZE)
			continue;
		unparse_node(node);
		node_set_offline(node);
	}

	if (acpi_numa <= 0)
		return -1;

	if (!nodes_cover_memory())
		goto reject;

	memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
	if (memnode_shift < 0) {
		printk(KERN_ERR
		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
		goto reject;
	}

	node_possible_map = nodes_parsed;

	/* Finally register nodes */
	for_each_node_mask(node, node_possible_map)
		setup_node_bootmem(node, nodes[node].start, nodes[node].end);

	/* Try again in case setup_node_bootmem missed one due
	   to missing bootmem */
	for_each_node_mask(node, node_possible_map) {
		if (node_online(node))
			continue;
		setup_node_bootmem(node, nodes[node].start, nodes[node].end);
	}

	/* Detach CPUs whose node did not make it into the possible map. */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu_to_node[cpu] != NUMA_NO_NODE &&
		    !node_isset(cpu_to_node[cpu], node_possible_map))
			numa_set_node(cpu, NUMA_NO_NODE);
	}
	numa_init_array();
	return 0;

reject:
	bad_srat();
	return -1;
}

443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461
/*
 * Reserve the hot-add range recorded for @nodeid in bootmem and log
 * how many MB of struct page that range accounts for.
 * Does nothing if no hot-add area was found or the node has none.
 */
void __init srat_reserve_add_area(int nodeid)
{
	struct bootnode *area = &nodes_add[nodeid];
	u64 total_mb;

	if (!found_add_area || !area->end)
		return;

	printk(KERN_INFO "SRAT: Reserving hot-add memory space "
			"for node %d at %Lx-%Lx\n",
		nodeid, area->start, area->end);

	/* pages in the area -> bytes of struct page -> MB */
	total_mb = (area->end - area->start) >> PAGE_SHIFT;
	total_mb *= sizeof(struct page);
	total_mb >>= 20;
	printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
			"pre-allocated memory.\n", (unsigned long long)total_mb);

	reserve_bootmem_node(NODE_DATA(nodeid), area->start,
			       area->end - area->start);
}

L
Linus Torvalds 已提交
462 463 464 465 466 467
/*
 * Distance between nodes @a and @b.
 *
 * Taken from the SLIT if one was accepted; otherwise 10 for the node
 * itself and 20 for any other node.
 */
int __node_distance(int a, int b)
{
	int row;

	if (acpi_slit) {
		row = acpi_slit->locality_count * node_to_pxm(a);
		return acpi_slit->entry[row + node_to_pxm(b)];
	}
	return (a == b) ? 10 : 20;
}

EXPORT_SYMBOL(__node_distance);
473 474 475 476 477 478 479 480 481 482 483

/*
 * Find the node whose recorded hot-add range contains the physical
 * address @start.
 *
 * Returns that node's id (the last match if several ranges contain
 * the address), or 0 if no range matches.
 */
int memory_add_physaddr_to_nid(u64 start)
{
	int node;
	int nid = 0;

	for_each_node(node) {
		struct bootnode *area = &nodes_add[node];

		if (start >= area->start && start < area->end)
			nid = node;
	}

	return nid;
}
484 485
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);