srat_64.c 13.6 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
18 19
#include <linux/bootmem.h>
#include <linux/mm.h>
L
Linus Torvalds 已提交
20 21
#include <asm/proto.h>
#include <asm/numa.h>
22
#include <asm/e820.h>
J
Jack Steiner 已提交
23
#include <asm/genapic.h>
L
Linus Torvalds 已提交
24

A
Andi Kleen 已提交
25 26
/*
 * SRAT usability state: set to 1 once a CPU affinity entry has been
 * accepted, and to -1 by bad_srat(). acpi_scan_nodes() refuses to run
 * unless the value is positive.
 */
int acpi_numa __initdata;

L
Linus Torvalds 已提交
27 28 29
/*
 * SLIT accepted by acpi_numa_slit_init(). Left NULL when no table was
 * accepted; __node_distance() then falls back to a local/remote guess.
 */
static struct acpi_table_slit *acpi_slit;

/* Nodes for which a memory affinity entry has been recorded. */
static nodemask_t nodes_parsed __initdata;
30
/* Address range spanned by each node, grown as memory affinity entries are parsed. */
static struct bootnode nodes[MAX_NUMNODES] __initdata;
31
/*
 * Hot-add address range per node. Not __initdata: it is read by
 * memory_add_physaddr_to_nid(), which is not an __init function.
 */
static struct bootnode nodes_add[MAX_NUMNODES];
32
/*
 * When non-zero, cutoff_node() leaves node ranges alone and
 * srat_reserve_add_area() reserves the hot-add range. Cleared by bad_srat().
 */
static int found_add_area __initdata;
33
/*
 * NOTE(review): not referenced in the visible part of this file; presumably
 * a hot-add tuning knob consumed elsewhere -- confirm before relying on it.
 */
int hotadd_percent __initdata = 0;
L
Linus Torvalds 已提交
34

35 36 37 38
/*
 * Every memory affinity entry recorded so far: entry i covers
 * node_memblk_range[i] and belongs to node memblk_nodeid[i].
 * num_node_memblks is the number of valid entries in both arrays.
 */
static int num_node_memblks __initdata;
static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;

39 40 41 42
/*
 * Minimum node size in bytes (4 MiB). Nodes smaller than this confuse the
 * VM badly; they usually result from BIOS bugs.
 */
#define NODE_MIN_SIZE (4*1024*1024)

L
Linus Torvalds 已提交
43 44
/*
 * Translate an ACPI proximity domain into a node id by delegating to
 * acpi_map_pxm_to_node(). Callers in this file treat a negative result
 * as "too many proximity domains".
 */
static __init int setup_node(int pxm)
{
	int node = acpi_map_pxm_to_node(pxm);

	return node;
}

48
static __init int conflicting_memblks(unsigned long start, unsigned long end)
L
Linus Torvalds 已提交
49 50
{
	int i;
51 52
	for (i = 0; i < num_node_memblks; i++) {
		struct bootnode *nd = &node_memblk_range[i];
L
Linus Torvalds 已提交
53 54 55
		if (nd->start == nd->end)
			continue;
		if (nd->end > start && nd->start < end)
56
			return memblk_nodeid[i];
L
Linus Torvalds 已提交
57
		if (nd->end == end && nd->start == start)
58
			return memblk_nodeid[i];
L
Linus Torvalds 已提交
59 60 61 62 63 64
	}
	return -1;
}

static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
65
	struct bootnode *nd = &nodes[i];
66 67 68 69

	if (found_add_area)
		return;

L
Linus Torvalds 已提交
70 71 72 73 74 75 76 77 78 79 80 81 82 83
	if (nd->start < start) {
		nd->start = start;
		if (nd->end < nd->start)
			nd->start = nd->end;
	}
	if (nd->end > end) {
		nd->end = end;
		if (nd->start > nd->end)
			nd->start = nd->end;
	}
}

static __init void bad_srat(void)
{
84
	int i;
L
Linus Torvalds 已提交
85 86
	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
87
	found_add_area = 0;
88 89
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		apicid_to_node[i] = NUMA_NO_NODE;
90 91
	for (i = 0; i < MAX_NUMNODES; i++)
		nodes_add[i].start = nodes[i].end = 0;
92
	remove_all_active_ranges();
L
Linus Torvalds 已提交
93 94 95 96 97 98 99
}

/* Non-zero when NUMA is switched off or the SRAT has already been rejected. */
static __init inline int srat_disabled(void)
{
	if (numa_off)
		return 1;
	return acpi_numa < 0;
}

A
Andi Kleen 已提交
100 101 102 103 104 105 106 107 108
/*
 * Many BIOSes fill the SLIT with 10 (= no distance) everywhere. That breaks
 * the NUMA heuristics, which want the local node to be closer than any
 * other node.
 *
 * Run a quick plausibility check: returns 1 if the table looks usable,
 * 0 otherwise.
 */
static __init int slit_valid(struct acpi_table_slit *slit)
{
	int row, col;
	int count = slit->locality_count;

	for (row = 0; row < count; row++) {
		for (col = 0; col < count; col++) {
			u8 dist = slit->entry[count * row + col];
			int is_self = (row == col);

			/* A locality must be exactly LOCAL_DISTANCE from itself... */
			if (is_self && dist != LOCAL_DISTANCE)
				return 0;
			/* ...and strictly farther than that from any other one. */
			if (!is_self && dist <= LOCAL_DISTANCE)
				return 0;
		}
	}
	return 1;
}

L
Linus Torvalds 已提交
123 124 125
/*
 * Callback for SLIT parsing: remember the table for __node_distance(), but
 * only if it passes slit_valid().
 */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	if (slit_valid(slit)) {
		acpi_slit = slit;
		return;
	}
	printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
135
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
L
Linus Torvalds 已提交
136 137
{
	int pxm, node;
138 139
	int apic_id;

140 141
	if (srat_disabled())
		return;
142
	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
143
		bad_srat();
144 145
		return;
	}
146
	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
L
Linus Torvalds 已提交
147
		return;
148
	pxm = pa->proximity_domain_lo;
L
Linus Torvalds 已提交
149 150 151 152 153 154
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
		bad_srat();
		return;
	}
155

J
Jack Steiner 已提交
156 157 158 159
	if (is_uv_system())
		apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
	else
		apic_id = pa->apic_id;
160
	apicid_to_node[apic_id] = node;
L
Linus Torvalds 已提交
161
	acpi_numa = 1;
162
	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
163
	       pxm, apic_id, node);
L
Linus Torvalds 已提交
164 165
}

I
Ingo Molnar 已提交
166
/*
 * Stub: always reports failure (-1). reserve_hotadd() returns this value,
 * so with this stub in place it reports -1 even for areas it accepted.
 */
static int update_end_of_memory(unsigned long end)
{
	return -1;
}

/* Stub: every hot-add area is treated as affordable (always returns 1). */
static int hotadd_enough_memory(struct bootnode *nd)
{
	return 1;
}

/*
 * Whether hot-pluggable SRAT memory ranges are processed at all: 1 with
 * CONFIG_MEMORY_HOTPLUG_SPARSE, 0 otherwise.
 */
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static inline int save_add_info(void)
{
	return 1;
}
#else
static inline int save_add_info(void)
{
	return 0;
}
#endif
173
/*
174
 * Update nodes_add and decide if to include add are in the zone.
J
Joe Perches 已提交
175
 * Both SPARSE and RESERVE need nodes_add information.
S
Simon Arlott 已提交
176
 * This code supports one contiguous hot add area per node.
177
 */
178 179
static int __init
reserve_hotadd(int node, unsigned long start, unsigned long end)
180 181 182
{
	unsigned long s_pfn = start >> PAGE_SHIFT;
	unsigned long e_pfn = end >> PAGE_SHIFT;
183
	int ret = 0, changed = 0;
184 185 186 187 188 189 190 191 192 193 194 195 196 197
	struct bootnode *nd = &nodes_add[node];

	/* I had some trouble with strange memory hotadd regions breaking
	   the boot. Be very strict here and reject anything unexpected.
	   If you want working memory hotadd write correct SRATs.

	   The node size check is a basic sanity check to guard against
	   mistakes */
	if ((signed long)(end - start) < NODE_MIN_SIZE) {
		printk(KERN_ERR "SRAT: Hotplug area too small\n");
		return -1;
	}

	/* This check might be a bit too strict, but I'm keeping it for now. */
198
	if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
199 200 201
		printk(KERN_ERR
			"SRAT: Hotplug area %lu -> %lu has existing memory\n",
			s_pfn, e_pfn);
202 203 204 205 206 207 208 209 210 211 212
		return -1;
	}

	if (!hotadd_enough_memory(&nodes_add[node]))  {
		printk(KERN_ERR "SRAT: Hotplug area too large\n");
		return -1;
	}

	/* Looks good */

	if (nd->start == nd->end) {
213 214
		nd->start = start;
		nd->end = end;
215
		changed = 1;
216 217 218
	} else {
		if (nd->start == end) {
			nd->start = start;
219 220
			changed = 1;
		}
221 222
		if (nd->end == start) {
			nd->end = end;
223 224 225 226
			changed = 1;
		}
		if (!changed)
			printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
227
	}
228

229
	ret = update_end_of_memory(nd->end);
230 231 232

	if (changed)
	 	printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
233
	return ret;
234 235
}

L
Linus Torvalds 已提交
236 237
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
238
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
L
Linus Torvalds 已提交
239
{
240
	struct bootnode *nd, oldnode;
L
Linus Torvalds 已提交
241 242 243 244
	unsigned long start, end;
	int node, pxm;
	int i;

245
	if (srat_disabled())
L
Linus Torvalds 已提交
246
		return;
247
	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
248 249 250
		bad_srat();
		return;
	}
251
	if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
252
		return;
253 254

	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
255
		return;
256 257
	start = ma->base_address;
	end = start + ma->length;
L
Linus Torvalds 已提交
258 259 260 261 262 263 264
	pxm = ma->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
		bad_srat();
		return;
	}
265
	i = conflicting_memblks(start, end);
266 267 268 269 270
	if (i == node) {
		printk(KERN_WARNING
		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
			pxm, start, end, nodes[i].start, nodes[i].end);
	} else if (i >= 0) {
L
Linus Torvalds 已提交
271
		printk(KERN_ERR
272 273 274
		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
		       pxm, start, end, node_to_pxm(i),
			nodes[i].start, nodes[i].end);
L
Linus Torvalds 已提交
275 276 277 278
		bad_srat();
		return;
	}
	nd = &nodes[node];
279
	oldnode = *nd;
L
Linus Torvalds 已提交
280 281 282 283 284 285 286 287 288
	if (!node_test_and_set(node, nodes_parsed)) {
		nd->start = start;
		nd->end = end;
	} else {
		if (start < nd->start)
			nd->start = start;
		if (nd->end < end)
			nd->end = end;
	}
289

290 291 292 293
	printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
	       start, end);
	e820_register_active_regions(node, start >> PAGE_SHIFT,
				     end >> PAGE_SHIFT);
294 295
	push_node_boundaries(node, nd->start >> PAGE_SHIFT,
						nd->end >> PAGE_SHIFT);
296

297 298
	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
	    (reserve_hotadd(node, start, end) < 0)) {
299 300 301 302 303 304
		/* Ignore hotadd region. Undo damage */
		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
		*nd = oldnode;
		if ((nd->start | nd->end) == 0)
			node_clear(node, nodes_parsed);
	}
305 306 307 308 309

	node_memblk_range[num_node_memblks].start = start;
	node_memblk_range[num_node_memblks].end = end;
	memblk_nodeid[num_node_memblks] = node;
	num_node_memblks++;
L
Linus Torvalds 已提交
310 311
}

312 313
/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
314
static int __init nodes_cover_memory(const struct bootnode *nodes)
315 316 317 318 319 320 321 322 323
{
	int i;
	unsigned long pxmram, e820ram;

	pxmram = 0;
	for_each_node_mask(i, nodes_parsed) {
		unsigned long s = nodes[i].start >> PAGE_SHIFT;
		unsigned long e = nodes[i].end >> PAGE_SHIFT;
		pxmram += e - s;
324
		pxmram -= absent_pages_in_range(s, e);
325 326
		if ((long)pxmram < 0)
			pxmram = 0;
327 328
	}

329
	e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
330 331
	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
332 333 334 335 336 337 338 339 340
		printk(KERN_ERR
	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
			(pxmram << PAGE_SHIFT) >> 20,
			(e820ram << PAGE_SHIFT) >> 20);
		return 0;
	}
	return 1;
}

341
/*
 * Forget a node: drop it from nodes_parsed and reset every APIC id that
 * was mapped to it back to NUMA_NO_NODE.
 */
static void __init unparse_node(int node)
{
	int apicid;

	node_clear(node, nodes_parsed);
	for (apicid = 0; apicid < MAX_LOCAL_APIC; apicid++) {
		if (apicid_to_node[apicid] != node)
			continue;
		apicid_to_node[apicid] = NUMA_NO_NODE;
	}
}

L
Linus Torvalds 已提交
351 352 353 354 355 356
/* Intentionally empty: this file does no work in acpi_numa_arch_fixup(). */
void __init acpi_numa_arch_fixup(void)
{
}

/*
 * Use the information discovered above to actually set up the nodes.
 *
 * @start/@end bound the address range every node is clamped to.
 * Returns 0 on success. Returns -1 when the SRAT is not in use, does not
 * cover memory, or no node hash shift can be computed; the latter two
 * cases also reject the SRAT via bad_srat().
 */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
	int nid, cpu;

	if (acpi_numa <= 0)
		return -1;

	/* First clean up the node list */
	for (nid = 0; nid < MAX_NUMNODES; nid++) {
		struct bootnode *range = &nodes[nid];

		cutoff_node(nid, start, end);
		/*
		 * don't confuse VM with a node that doesn't have the
		 * minimum memory.
		 */
		if (!range->end)
			continue;
		if ((range->end - range->start) >= NODE_MIN_SIZE)
			continue;
		unparse_node(nid);
		node_set_offline(nid);
	}

	if (!nodes_cover_memory(nodes)) {
		bad_srat();
		return -1;
	}

	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
					   memblk_nodeid);
	if (memnode_shift < 0) {
		printk(KERN_ERR
		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	node_possible_map = nodes_parsed;

	/* Finally register nodes */
	for_each_node_mask(nid, node_possible_map)
		setup_node_bootmem(nid, nodes[nid].start, nodes[nid].end);

	/* Try again in case setup_node_bootmem missed one due
	   to missing bootmem */
	for_each_node_mask(nid, node_possible_map) {
		if (node_online(nid))
			continue;
		setup_node_bootmem(nid, nodes[nid].start, nodes[nid].end);
	}

	/* Detach CPUs whose node did not make it into node_possible_map. */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		int node = early_cpu_to_node(cpu);

		if (node != NUMA_NO_NODE &&
		    !node_isset(node, node_possible_map))
			numa_set_node(cpu, NUMA_NO_NODE);
	}
	numa_init_array();
	return 0;
}

412
#ifdef CONFIG_NUMA_EMU
413 414 415
static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
	[0 ... MAX_NUMNODES-1] = PXM_INVAL
};
416
static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
417 418
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434
/*
 * Return the first node in nodes_parsed whose range [start, end) contains
 * @addr, or NUMA_NO_NODE if none does.
 */
static int __init find_node_by_addr(unsigned long addr)
{
	int nid;

	for_each_node_mask(nid, nodes_parsed) {
		/*
		 * Find the real node that an emulated node appears on. For
		 * the sake of simplicity, the caller passes only the
		 * emulated node's starting address.
		 */
		if (addr >= nodes[nid].start && addr < nodes[nid].end)
			return nid;
	}
	return NUMA_NO_NODE;
}

/*
 * In NUMA emulation, we need to set up proximity domain (_PXM) to node ID
 * mappings that respect the real ACPI topology but reflect our emulated
 * environment. For each emulated node, find the real node it starts on and
 * give the fake node that real node's PXM, so that SLIT lookups between
 * emulated nodes reflect the real topology.
 *
 * Afterwards apicid_to_node and nodes_parsed describe the emulated nodes
 * instead of the real ones.
 */
void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
	int fake, apicid;

	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
			 "topology.\n");
	for (fake = 0; fake < num_nodes; fake++) {
		int real, pxm;

		real = find_node_by_addr(fake_nodes[fake].start);
		if (real == NUMA_NO_NODE)
			continue;
		pxm = node_to_pxm(real);
		if (pxm == PXM_INVAL)
			continue;
		fake_node_to_pxm_map[fake] = pxm;
		/*
		 * For each apicid_to_node mapping that exists for this real
		 * node, it must now point to the fake node ID.
		 */
		for (apicid = 0; apicid < MAX_LOCAL_APIC; apicid++) {
			if (apicid_to_node[apicid] == real)
				fake_apicid_to_node[apicid] = fake;
		}
	}

	/* Publish the PXM <-> fake node mappings collected above. */
	for (fake = 0; fake < num_nodes; fake++)
		__acpi_map_pxm_to_node(fake_node_to_pxm_map[fake], fake);
	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));

	/* Rebuild nodes_parsed from the non-empty fake nodes. */
	nodes_clear(nodes_parsed);
	for (fake = 0; fake < num_nodes; fake++) {
		if (fake_nodes[fake].start != fake_nodes[fake].end)
			node_set(fake, nodes_parsed);
	}
	WARN_ON(!nodes_cover_memory(fake_nodes));
}

/*
 * Used by __node_distance() when there is no SLIT: with NUMA emulation two
 * nodes count as local to each other when they map to the same PXM.
 */
static int null_slit_node_compare(int a, int b)
{
	int pxm_a = node_to_pxm(a);
	int pxm_b = node_to_pxm(b);

	return pxm_a == pxm_b;
}
#else
/*
 * Used by __node_distance() when there is no SLIT: without NUMA emulation
 * a node is local only to itself. Returns 1 if a and b are the same node.
 */
static int null_slit_node_compare(int a, int b)
{
	if (a != b)
		return 0;
	return 1;
}
#endif /* CONFIG_NUMA_EMU */

492 493 494 495 496 497 498 499 500 501 502 503 504 505 506
/*
 * Reserve the hot-add address range of @nodeid in that node's bootmem and
 * log how much memory the struct page entries for it will take.
 * Does nothing unless found_add_area is set and the node has a hot-add
 * range with a non-zero end.
 */
void __init srat_reserve_add_area(int nodeid)
{
	struct bootnode *area;
	u64 total_mb;

	if (!found_add_area)
		return;
	area = &nodes_add[nodeid];
	if (!area->end)
		return;

	printk(KERN_INFO "SRAT: Reserving hot-add memory space "
			"for node %d at %Lx-%Lx\n",
		nodeid, area->start, area->end);

	/* pages in the area -> bytes of struct page -> megabytes */
	total_mb = (area->end - area->start) >> PAGE_SHIFT;
	total_mb *= sizeof(struct page);
	total_mb >>= 20;
	printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
			"pre-allocated memory.\n", (unsigned long long)total_mb);

	reserve_bootmem_node(NODE_DATA(nodeid), area->start,
			       area->end - area->start,
			       BOOTMEM_DEFAULT);
}

L
Linus Torvalds 已提交
512 513 514 515 516
int __node_distance(int a, int b)
{
	int index;

	if (!acpi_slit)
517 518
		return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
						      REMOTE_DISTANCE;
519
	index = acpi_slit->locality_count * node_to_pxm(a);
L
Linus Torvalds 已提交
520 521 522 523
	return acpi_slit->entry[index + node_to_pxm(b)];
}

EXPORT_SYMBOL(__node_distance);
524 525 526 527 528 529 530 531 532 533 534

/*
 * Map a physical address to the node whose hot-add range contains it.
 *
 * All nodes are scanned and the last matching one wins; node 0 is returned
 * when no hot-add range contains @start.
 */
int memory_add_physaddr_to_nid(u64 start)
{
	int nid, found = 0;

	for_each_node(nid) {
		if (start < nodes_add[nid].start)
			continue;
		if (start >= nodes_add[nid].end)
			continue;
		found = nid;
	}

	return found;
}
535 536
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);