/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>
#include <asm/genapic.h>

int acpi_numa __initdata;

static struct acpi_table_slit *acpi_slit;

static nodemask_t nodes_parsed __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode nodes_add[MAX_NUMNODES];
static int found_add_area __initdata;
int hotadd_percent __initdata = 0;

static int num_node_memblks __initdata;
static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;

/* Too small nodes confuse the VM badly. Usually they result
   from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)

static __init int setup_node(int pxm)
{
	return acpi_map_pxm_to_node(pxm);
}

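/*
 * Check a new affinity range against everything recorded so far.  Returns
 * the node id of the first memblk that overlaps [start, end), or -1 if
 * there is no conflict.
 */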
static __init int conflicting_memblks(unsigned long start, unsigned long end)
{
	int i;
	for (i = 0; i < num_node_memblks; i++) {
		struct bootnode *nd = &node_memblk_range[i];
		if (nd->start == nd->end)
			continue;
		if (nd->end > start && nd->start < end)
			return memblk_nodeid[i];
		if (nd->end == end && nd->start == start)
			return memblk_nodeid[i];
	}
	return -1;
}

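/*
 * Clamp node i's range to [start, end).  Left alone once a hot-add area
 * has been found, since such ranges may extend beyond currently present
 * memory.
 */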
static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
	struct bootnode *nd = &nodes[i];

	if (found_add_area)
		return;

	if (nd->start < start) {
		nd->start = start;
		if (nd->end < nd->start)
			nd->start = nd->end;
	}
	if (nd->end > end) {
		nd->end = end;
		if (nd->start > nd->end)
			nd->start = nd->end;
	}
}

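/* Throw away all state derived from a broken SRAT and mark it unusable. */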
static __init void bad_srat(void)
{
	int i;
	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
	found_add_area = 0;
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		apicid_to_node[i] = NUMA_NO_NODE;
	for (i = 0; i < MAX_NUMNODES; i++)
		nodes_add[i].start = nodes[i].end = 0;
	remove_all_active_ranges();
}

static __init inline int srat_disabled(void)
{
	return numa_off || acpi_numa < 0;
}

/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	acpi_slit = slit;
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
{
	int pxm, node;
	int apic_id;

	if (srat_disabled())
		return;
	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
		bad_srat();
		return;
	}
	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
		return;
	pxm = pa->proximity_domain_lo;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
		bad_srat();
		return;
	}

	if (is_uv_system())
		apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
	else
		apic_id = pa->apic_id;
	apicid_to_node[apic_id] = node;
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
	       pxm, apic_id, node);
}

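/*
 * Hot-add bookkeeping helpers: update_end_of_memory() and
 * hotadd_enough_memory() are trivial stubs here; save_add_info() says
 * whether hot-pluggable SRAT entries should be recorded at all.
 */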
static int update_end_of_memory(unsigned long end) {return -1;}
static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static inline int save_add_info(void) {return 1;}
#else
static inline int save_add_info(void) {return 0;}
#endif
/*
 * Update nodes_add[] and decide whether to include the hot-add area in the zone.
 * Both SPARSE and RESERVE need nodes_add information.
 * This code supports one contiguous hot add area per node.
 */
static int __init
reserve_hotadd(int node, unsigned long start, unsigned long end)
{
	unsigned long s_pfn = start >> PAGE_SHIFT;
	unsigned long e_pfn = end >> PAGE_SHIFT;
	int ret = 0, changed = 0;
	struct bootnode *nd = &nodes_add[node];

	/* I had some trouble with strange memory hotadd regions breaking
	   the boot. Be very strict here and reject anything unexpected.
	   If you want working memory hotadd, write correct SRATs.

	   The node size check is a basic sanity check to guard against
	   mistakes */
	if ((signed long)(end - start) < NODE_MIN_SIZE) {
		printk(KERN_ERR "SRAT: Hotplug area too small\n");
		return -1;
	}

	/* This check might be a bit too strict, but I'm keeping it for now. */
	if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
		printk(KERN_ERR
			"SRAT: Hotplug area %lu -> %lu has existing memory\n",
			s_pfn, e_pfn);
		return -1;
	}

	if (!hotadd_enough_memory(&nodes_add[node])) {
		printk(KERN_ERR "SRAT: Hotplug area too large\n");
		return -1;
	}

	/* Looks good */

	if (nd->start == nd->end) {
		nd->start = start;
		nd->end = end;
		changed = 1;
	} else {
		if (nd->start == end) {
			nd->start = start;
			changed = 1;
		}
		if (nd->end == start) {
			nd->end = end;
			changed = 1;
		}
		if (!changed)
			printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
	}

	ret = update_end_of_memory(nd->end);

	if (changed)
		printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
		       nd->start, nd->end);
	return ret;
}

/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
	struct bootnode *nd, oldnode;
	unsigned long start, end;
	int node, pxm;
	int i;

	if (srat_disabled())
		return;
	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
		bad_srat();
		return;
	}
	if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
		return;

	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
		return;
	start = ma->base_address;
	end = start + ma->length;
	pxm = ma->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
		bad_srat();
		return;
	}
	i = conflicting_memblks(start, end);
	if (i == node) {
		printk(KERN_WARNING
		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
			pxm, start, end, nodes[i].start, nodes[i].end);
	} else if (i >= 0) {
		printk(KERN_ERR
		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
		       pxm, start, end, node_to_pxm(i),
			nodes[i].start, nodes[i].end);
		bad_srat();
		return;
	}
	nd = &nodes[node];
	oldnode = *nd;
	if (!node_test_and_set(node, nodes_parsed)) {
		nd->start = start;
		nd->end = end;
	} else {
		if (start < nd->start)
			nd->start = start;
		if (nd->end < end)
			nd->end = end;
	}

	printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
	       start, end);
	e820_register_active_regions(node, start >> PAGE_SHIFT,
				     end >> PAGE_SHIFT);
	push_node_boundaries(node, nd->start >> PAGE_SHIFT,
						nd->end >> PAGE_SHIFT);

	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
	    (reserve_hotadd(node, start, end) < 0)) {
		/* Ignore hotadd region. Undo damage */
		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
		*nd = oldnode;
		if ((nd->start | nd->end) == 0)
			node_clear(node, nodes_parsed);
	}

	node_memblk_range[num_node_memblks].start = start;
	node_memblk_range[num_node_memblks].end = end;
	memblk_nodeid[num_node_memblks] = node;
	num_node_memblks++;
}

/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int __init nodes_cover_memory(const struct bootnode *nodes)
{
	int i;
	unsigned long pxmram, e820ram;

	pxmram = 0;
	for_each_node_mask(i, nodes_parsed) {
		unsigned long s = nodes[i].start >> PAGE_SHIFT;
		unsigned long e = nodes[i].end >> PAGE_SHIFT;
		pxmram += e - s;
		pxmram -= absent_pages_in_range(s, e);
		if ((long)pxmram < 0)
			pxmram = 0;
	}

	e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
		printk(KERN_ERR
	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
			(pxmram << PAGE_SHIFT) >> 20,
			(e820ram << PAGE_SHIFT) >> 20);
		return 0;
	}
	return 1;
}

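/* Drop a node from nodes_parsed and invalidate every APIC id mapped to it. */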
static void __init unparse_node(int node)
{
	int i;
	node_clear(node, nodes_parsed);
	for (i = 0; i < MAX_LOCAL_APIC; i++) {
		if (apicid_to_node[i] == node)
			apicid_to_node[i] = NUMA_NO_NODE;
	}
}

void __init acpi_numa_arch_fixup(void) {}

/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
	int i;

	if (acpi_numa <= 0)
		return -1;

	/* First clean up the node list */
	for (i = 0; i < MAX_NUMNODES; i++) {
		cutoff_node(i, start, end);
		/*
		 * don't confuse VM with a node that doesn't have the
		 * minimum memory.
		 */
		if (nodes[i].end &&
			(nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
			unparse_node(i);
			node_set_offline(i);
		}
	}

	if (!nodes_cover_memory(nodes)) {
		bad_srat();
		return -1;
	}

	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
					   memblk_nodeid);
	if (memnode_shift < 0) {
		printk(KERN_ERR
		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	node_possible_map = nodes_parsed;

	/* Finally register nodes */
	for_each_node_mask(i, node_possible_map)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	/* Try again in case setup_node_bootmem missed one due
	   to missing bootmem */
	for_each_node_mask(i, node_possible_map)
		if (!node_online(i))
			setup_node_bootmem(i, nodes[i].start, nodes[i].end);

	for (i = 0; i < NR_CPUS; i++) {
		int node = early_cpu_to_node(i);

		if (node == NUMA_NO_NODE)
			continue;
		if (!node_isset(node, node_possible_map))
			numa_clear_node(i);
	}
	numa_init_array();
	return 0;
}

#ifdef CONFIG_NUMA_EMU
static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
	[0 ... MAX_NUMNODES-1] = PXM_INVAL
};
static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
static int __init find_node_by_addr(unsigned long addr)
{
	int ret = NUMA_NO_NODE;
	int i;

	for_each_node_mask(i, nodes_parsed) {
		/*
		 * Find the real node that this emulated node appears on.  For
		 * the sake of simplicity, we only use a real node's starting
		 * address to determine which emulated node it appears on.
		 */
		if (addr >= nodes[i].start && addr < nodes[i].end) {
			ret = i;
			break;
		}
	}
	return ret;
}

/*
 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
 * mappings that respect the real ACPI topology but reflect our emulated
 * environment.  For each emulated node, we find which real node it appears on
 * and create PXM to NID mappings for those fake nodes which mirror that
 * locality.  SLIT will now represent the correct distances between emulated
 * nodes as a result of the real topology.
 */
void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
	int i, j;

	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
			 "topology.\n");
	for (i = 0; i < num_nodes; i++) {
		int nid, pxm;

		nid = find_node_by_addr(fake_nodes[i].start);
		if (nid == NUMA_NO_NODE)
			continue;
		pxm = node_to_pxm(nid);
		if (pxm == PXM_INVAL)
			continue;
		fake_node_to_pxm_map[i] = pxm;
		/*
		 * For each apicid_to_node mapping that exists for this real
		 * node, it must now point to the fake node ID.
		 */
		for (j = 0; j < MAX_LOCAL_APIC; j++)
			if (apicid_to_node[j] == nid)
				fake_apicid_to_node[j] = i;
	}
	for (i = 0; i < num_nodes; i++)
		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));

	nodes_clear(nodes_parsed);
	for (i = 0; i < num_nodes; i++)
		if (fake_nodes[i].start != fake_nodes[i].end)
			node_set(i, nodes_parsed);
	WARN_ON(!nodes_cover_memory(fake_nodes));
}

static int null_slit_node_compare(int a, int b)
{
	return node_to_pxm(a) == node_to_pxm(b);
}
#else
static int null_slit_node_compare(int a, int b)
{
	return a == b;
}
#endif /* CONFIG_NUMA_EMU */

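/*
 * Reserve the address range recorded in nodes_add[nodeid] from bootmem so
 * that early allocations stay out of the hot-add window.
 */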
void __init srat_reserve_add_area(int nodeid)
{
	if (found_add_area && nodes_add[nodeid].end) {
		u64 total_mb;

		printk(KERN_INFO "SRAT: Reserving hot-add memory space "
				"for node %d at %Lx-%Lx\n",
			nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
		total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
					>> PAGE_SHIFT;
		total_mb *= sizeof(struct page);
		total_mb >>= 20;
		printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
				"pre-allocated memory.\n", (unsigned long long)total_mb);
		reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
			       nodes_add[nodeid].end - nodes_add[nodeid].start,
			       BOOTMEM_DEFAULT);
	}
}

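/*
 * Node distance as reported by the SLIT.  Without a SLIT, report
 * LOCAL_DISTANCE for a node compared with itself (or, under NUMA
 * emulation, with any node sharing its PXM) and REMOTE_DISTANCE otherwise.
 */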
int __node_distance(int a, int b)
{
	int index;

	if (!acpi_slit)
		return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
						      REMOTE_DISTANCE;
	index = acpi_slit->locality_count * node_to_pxm(a);
	return acpi_slit->entry[index + node_to_pxm(b)];
}

EXPORT_SYMBOL(__node_distance);

#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
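/*
 * Map a hot-added physical address to the node whose hot-add window
 * contains it; falls back to node 0 when nothing matches.
 */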
int memory_add_physaddr_to_nid(u64 start)
{
	int i, ret = 0;

	for_each_node(i)
		if (nodes_add[i].start <= start && nodes_add[i].end > start)
			ret = i;

	return ret;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif