srat_64.c 13.1 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
18 19
#include <linux/bootmem.h>
#include <linux/mm.h>
L
Linus Torvalds 已提交
20 21
#include <asm/proto.h>
#include <asm/numa.h>
22
#include <asm/e820.h>
L
Linus Torvalds 已提交
23

A
Andi Kleen 已提交
24 25
int acpi_numa __initdata;

L
Linus Torvalds 已提交
26 27 28
static struct acpi_table_slit *acpi_slit;

static nodemask_t nodes_parsed __initdata;
29
static struct bootnode nodes[MAX_NUMNODES] __initdata;
30
static struct bootnode nodes_add[MAX_NUMNODES];
31
static int found_add_area __initdata;
32
int hotadd_percent __initdata = 0;
L
Linus Torvalds 已提交
33

34 35 36 37
/* Too small nodes confuse the VM badly. Usually they result
   from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)

L
Linus Torvalds 已提交
38 39
/*
 * Translate the proximity domain @pxm into a node id by delegating to
 * acpi_map_pxm_to_node(). Callers in this file treat a negative result
 * as "no node available".
 */
static __init int setup_node(int pxm)
{
	return acpi_map_pxm_to_node(pxm);
}

static __init int conflicting_nodes(unsigned long start, unsigned long end)
{
	int i;
46
	for_each_node_mask(i, nodes_parsed) {
47
		struct bootnode *nd = &nodes[i];
L
Linus Torvalds 已提交
48 49 50
		if (nd->start == nd->end)
			continue;
		if (nd->end > start && nd->start < end)
51
			return i;
L
Linus Torvalds 已提交
52
		if (nd->end == end && nd->start == start)
53
			return i;
L
Linus Torvalds 已提交
54 55 56 57 58 59
	}
	return -1;
}

static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
60
	struct bootnode *nd = &nodes[i];
61 62 63 64

	if (found_add_area)
		return;

L
Linus Torvalds 已提交
65 66 67 68 69 70 71 72 73 74 75 76 77 78
	if (nd->start < start) {
		nd->start = start;
		if (nd->end < nd->start)
			nd->start = nd->end;
	}
	if (nd->end > end) {
		nd->end = end;
		if (nd->start > nd->end)
			nd->start = nd->end;
	}
}

static __init void bad_srat(void)
{
79
	int i;
L
Linus Torvalds 已提交
80 81
	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
82
	found_add_area = 0;
83 84
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		apicid_to_node[i] = NUMA_NO_NODE;
85 86
	for (i = 0; i < MAX_NUMNODES; i++)
		nodes_add[i].start = nodes[i].end = 0;
87
	remove_all_active_ranges();
L
Linus Torvalds 已提交
88 89 90 91 92 93 94
}

/* Non-zero when the SRAT must be ignored: NUMA is off or parsing has failed. */
static __init inline int srat_disabled(void)
{
	if (numa_off)
		return 1;
	return acpi_numa < 0;
}

A
Andi Kleen 已提交
95 96 97 98 99 100 101 102 103
/*
 * A lot of BIOSes fill in 10 (= no distance) everywhere. This messes
 * up the NUMA heuristics, which want the local node to have a smaller
 * distance than the others.
 * Do some quick checks here and only use the SLIT if it passes.
 *
 * The table is read as a locality_count x locality_count matrix stored row
 * by row. Returns 1 when every diagonal entry equals LOCAL_DISTANCE and every
 * other entry is strictly larger, 0 otherwise.
 */
static __init int slit_valid(struct acpi_table_slit *slit)
{
	int i, j;
	int d = slit->locality_count;

	for (i = 0; i < d; i++) {
		for (j = 0; j < d; j++)  {
			u8 val = slit->entry[d*i + j];

			if (i == j) {
				if (val != LOCAL_DISTANCE)
					return 0;
			} else if (val <= LOCAL_DISTANCE)
				return 0;
		}
	}
	return 1;
}

L
Linus Torvalds 已提交
118 119 120
/*
 * Callback for SLIT parsing.
 *
 * Remembers @slit in acpi_slit only if it passes slit_valid(); otherwise the
 * table is reported as invalid and acpi_slit is left untouched.
 */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	if (!slit_valid(slit)) {
		printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
		return;
	}
	acpi_slit = slit;
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
130
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
L
Linus Torvalds 已提交
131 132
{
	int pxm, node;
133 134 135
	int apic_id;

	apic_id = pa->apic_id;
136 137
	if (srat_disabled())
		return;
138
	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
139
		bad_srat();
140 141
		return;
	}
142
	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
L
Linus Torvalds 已提交
143
		return;
144
	pxm = pa->proximity_domain_lo;
L
Linus Torvalds 已提交
145 146 147 148 149 150
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
		bad_srat();
		return;
	}
151
	apicid_to_node[apic_id] = node;
L
Linus Torvalds 已提交
152
	acpi_numa = 1;
153
	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
154
	       pxm, apic_id, node);
L
Linus Torvalds 已提交
155 156
}

157
/* Stub: ignores @end and always reports failure (-1). */
int update_end_of_memory(unsigned long end) {return -1;}
/* Stub: ignores @nd and always answers "enough memory" (1). */
static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
/*
 * Tells whether hot-pluggable memory ranges should be looked at at all:
 * 1 with CONFIG_MEMORY_HOTPLUG_SPARSE, 0 otherwise.
 */
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static inline int save_add_info(void) {return 1;}
#else
static inline int save_add_info(void) {return 0;}
#endif
164
/*
165
 * Update nodes_add and decide if to include add are in the zone.
J
Joe Perches 已提交
166
 * Both SPARSE and RESERVE need nodes_add information.
S
Simon Arlott 已提交
167
 * This code supports one contiguous hot add area per node.
168 169 170 171 172
 */
static int reserve_hotadd(int node, unsigned long start, unsigned long end)
{
	unsigned long s_pfn = start >> PAGE_SHIFT;
	unsigned long e_pfn = end >> PAGE_SHIFT;
173
	int ret = 0, changed = 0;
174 175 176 177 178 179 180 181 182 183 184 185 186 187
	struct bootnode *nd = &nodes_add[node];

	/* I had some trouble with strange memory hotadd regions breaking
	   the boot. Be very strict here and reject anything unexpected.
	   If you want working memory hotadd write correct SRATs.

	   The node size check is a basic sanity check to guard against
	   mistakes */
	if ((signed long)(end - start) < NODE_MIN_SIZE) {
		printk(KERN_ERR "SRAT: Hotplug area too small\n");
		return -1;
	}

	/* This check might be a bit too strict, but I'm keeping it for now. */
188
	if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
189 190 191
		printk(KERN_ERR
			"SRAT: Hotplug area %lu -> %lu has existing memory\n",
			s_pfn, e_pfn);
192 193 194 195 196 197 198 199 200 201 202
		return -1;
	}

	if (!hotadd_enough_memory(&nodes_add[node]))  {
		printk(KERN_ERR "SRAT: Hotplug area too large\n");
		return -1;
	}

	/* Looks good */

	if (nd->start == nd->end) {
203 204
		nd->start = start;
		nd->end = end;
205
		changed = 1;
206 207 208
	} else {
		if (nd->start == end) {
			nd->start = start;
209 210
			changed = 1;
		}
211 212
		if (nd->end == start) {
			nd->end = end;
213 214 215 216
			changed = 1;
		}
		if (!changed)
			printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
217
	}
218

219
	ret = update_end_of_memory(nd->end);
220 221 222

	if (changed)
	 	printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
223
	return ret;
224 225
}

L
Linus Torvalds 已提交
226 227
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
228
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
L
Linus Torvalds 已提交
229
{
230
	struct bootnode *nd, oldnode;
L
Linus Torvalds 已提交
231 232 233 234
	unsigned long start, end;
	int node, pxm;
	int i;

235
	if (srat_disabled())
L
Linus Torvalds 已提交
236
		return;
237
	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
238 239 240
		bad_srat();
		return;
	}
241
	if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
242
		return;
243 244

	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
245
		return;
246 247
	start = ma->base_address;
	end = start + ma->length;
L
Linus Torvalds 已提交
248 249 250 251 252 253 254 255
	pxm = ma->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
		bad_srat();
		return;
	}
	i = conflicting_nodes(start, end);
256 257 258 259 260
	if (i == node) {
		printk(KERN_WARNING
		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
			pxm, start, end, nodes[i].start, nodes[i].end);
	} else if (i >= 0) {
L
Linus Torvalds 已提交
261
		printk(KERN_ERR
262 263 264
		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
		       pxm, start, end, node_to_pxm(i),
			nodes[i].start, nodes[i].end);
L
Linus Torvalds 已提交
265 266 267 268
		bad_srat();
		return;
	}
	nd = &nodes[node];
269
	oldnode = *nd;
L
Linus Torvalds 已提交
270 271 272 273 274 275 276 277 278
	if (!node_test_and_set(node, nodes_parsed)) {
		nd->start = start;
		nd->end = end;
	} else {
		if (start < nd->start)
			nd->start = start;
		if (nd->end < end)
			nd->end = end;
	}
279

L
Linus Torvalds 已提交
280 281
	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
	       nd->start, nd->end);
282 283
	e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
						nd->end >> PAGE_SHIFT);
284 285
	push_node_boundaries(node, nd->start >> PAGE_SHIFT,
						nd->end >> PAGE_SHIFT);
286

287 288
	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
	    (reserve_hotadd(node, start, end) < 0)) {
289 290 291 292 293 294
		/* Ignore hotadd region. Undo damage */
		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
		*nd = oldnode;
		if ((nd->start | nd->end) == 0)
			node_clear(node, nodes_parsed);
	}
L
Linus Torvalds 已提交
295 296
}

297 298
/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
299
static int __init nodes_cover_memory(const struct bootnode *nodes)
300 301 302 303 304 305 306 307 308
{
	int i;
	unsigned long pxmram, e820ram;

	pxmram = 0;
	for_each_node_mask(i, nodes_parsed) {
		unsigned long s = nodes[i].start >> PAGE_SHIFT;
		unsigned long e = nodes[i].end >> PAGE_SHIFT;
		pxmram += e - s;
309
		pxmram -= absent_pages_in_range(s, e);
310 311
		if ((long)pxmram < 0)
			pxmram = 0;
312 313
	}

314
	e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
315 316
	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
317 318 319 320 321 322 323 324 325
		printk(KERN_ERR
	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
			(pxmram << PAGE_SHIFT) >> 20,
			(e820ram << PAGE_SHIFT) >> 20);
		return 0;
	}
	return 1;
}

326
/*
 * Forget that @node was parsed: drop it from nodes_parsed and detach every
 * APIC id that was mapped to it.
 */
static void __init unparse_node(int node)
{
	int i;

	node_clear(node, nodes_parsed);
	for (i = 0; i < MAX_LOCAL_APIC; i++) {
		if (apicid_to_node[i] == node)
			apicid_to_node[i] = NUMA_NO_NODE;
	}
}

L
Linus Torvalds 已提交
336 337 338 339 340 341
/*
 * Intentionally empty: nothing is fixed up here.
 * Presumably a hook expected by the generic ACPI NUMA code - confirm.
 */
void __init acpi_numa_arch_fixup(void)
{
}

/*
 * Use the information discovered above to actually set up the nodes.
 *
 * @start, @end: window every node range is clamped to (see cutoff_node()).
 *
 * Returns 0 on success. Returns -1 when no usable SRAT was parsed
 * (acpi_numa <= 0), when the parsed nodes do not cover the memory, or when no
 * hash shift could be computed; the latter two cases also call bad_srat().
 */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
	int i;

	if (acpi_numa <= 0)
		return -1;

	/* First clean up the node list */
	for (i = 0; i < MAX_NUMNODES; i++) {
		cutoff_node(i, start, end);
		/*
		 * don't confuse VM with a node that doesn't have the
		 * minimum memory.
		 */
		if (nodes[i].end &&
			(nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
			unparse_node(i);
			node_set_offline(i);
		}
	}

	if (!nodes_cover_memory(nodes)) {
		bad_srat();
		return -1;
	}

	memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
	if (memnode_shift < 0) {
		printk(KERN_ERR
		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	node_possible_map = nodes_parsed;

	/* Finally register nodes */
	for_each_node_mask(i, node_possible_map)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	/* Try again in case setup_node_bootmem missed one due
	   to missing bootmem */
	for_each_node_mask(i, node_possible_map)
		if (!node_online(i))
			setup_node_bootmem(i, nodes[i].start, nodes[i].end);

	/* Detach CPUs whose node did not survive the clean-up above */
	for (i = 0; i < NR_CPUS; i++) {
		int node = early_cpu_to_node(i);

		if (node == NUMA_NO_NODE)
			continue;
		if (!node_isset(node, node_possible_map))
			numa_set_node(i, NUMA_NO_NODE);
	}
	numa_init_array();
	return 0;
}

396
#ifdef CONFIG_NUMA_EMU
397 398 399
static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
	[0 ... MAX_NUMNODES-1] = PXM_INVAL
};
400
static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
401 402
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418
/*
 * Return the first parsed node whose range [start, end) contains @addr,
 * or NUMA_NO_NODE when no parsed node does.
 */
static int __init find_node_by_addr(unsigned long addr)
{
	int ret = NUMA_NO_NODE;
	int i;

	for_each_node_mask(i, nodes_parsed) {
		/*
		 * Find the real node that this emulated node appears on.  For
		 * the sake of simplicity, we only use a real node's starting
		 * address to determine which emulated node it appears on.
		 */
		if (addr >= nodes[i].start && addr < nodes[i].end) {
			ret = i;
			break;
		}
	}
	return ret;
}

/*
 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
 * mappings that respect the real ACPI topology but reflect our emulated
 * environment.  For each emulated node, we find which real node it appears on
 * and create PXM to NID mappings for those fake nodes which mirror that
 * locality.  SLIT will now represent the correct distances between emulated
 * nodes as a result of the real topology.
 *
 * @fake_nodes: ranges of the emulated nodes
 * @num_nodes:  number of entries in @fake_nodes
 *
 * On return apicid_to_node[] and nodes_parsed describe the emulated nodes
 * instead of the real ones.
 */
void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
	int i, j;

	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
			 "topology.\n");
	for (i = 0; i < num_nodes; i++) {
		int nid, pxm;

		nid = find_node_by_addr(fake_nodes[i].start);
		if (nid == NUMA_NO_NODE)
			continue;
		pxm = node_to_pxm(nid);
		if (pxm == PXM_INVAL)
			continue;
		fake_node_to_pxm_map[i] = pxm;
		/*
		 * For each apicid_to_node mapping that exists for this real
		 * node, it must now point to the fake node ID.
		 */
		for (j = 0; j < MAX_LOCAL_APIC; j++)
			if (apicid_to_node[j] == nid)
				fake_apicid_to_node[j] = i;
	}
	for (i = 0; i < num_nodes; i++)
		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));

	/* From here on only non-empty emulated nodes count as parsed */
	nodes_clear(nodes_parsed);
	for (i = 0; i < num_nodes; i++)
		if (fake_nodes[i].start != fake_nodes[i].end)
			node_set(i, nodes_parsed);
	WARN_ON(!nodes_cover_memory(fake_nodes));
}

/*
 * Without a SLIT, two (emulated) nodes count as the same locality when they
 * map to the same proximity domain. Returns 1 if so, 0 otherwise.
 */
static int null_slit_node_compare(int a, int b)
{
	int pxm_a = node_to_pxm(a);
	int pxm_b = node_to_pxm(b);

	return pxm_a == pxm_b;
}
#else
/*
 * Without a SLIT, a node is only local to itself.
 * Returns 1 when @a and @b are the same node, 0 otherwise.
 */
static int null_slit_node_compare(int a, int b)
{
	if (a != b)
		return 0;
	return 1;
}
#endif /* CONFIG_NUMA_EMU */

476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494
/*
 * Reserve the hot-add area recorded for node @nodeid in that node's bootmem
 * and report how much memory the page structures covering the area cost.
 * Does nothing unless found_add_area is set and the node has a recorded
 * hot-add area (non-zero end).
 */
void __init srat_reserve_add_area(int nodeid)
{
	u64 cost;

	if (!found_add_area)
		return;
	if (!nodes_add[nodeid].end)
		return;

	printk(KERN_INFO "SRAT: Reserving hot-add memory space "
			"for node %d at %Lx-%Lx\n",
		nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);

	/* pages in the area -> bytes of struct page -> megabytes */
	cost = (nodes_add[nodeid].end - nodes_add[nodeid].start) >> PAGE_SHIFT;
	cost = cost * sizeof(struct page);
	cost = cost >> 20;
	printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
			"pre-allocated memory.\n", (unsigned long long)cost);

	reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
		       nodes_add[nodeid].end - nodes_add[nodeid].start);
}

L
Linus Torvalds 已提交
495 496 497 498 499
int __node_distance(int a, int b)
{
	int index;

	if (!acpi_slit)
500 501
		return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
						      REMOTE_DISTANCE;
502
	index = acpi_slit->locality_count * node_to_pxm(a);
L
Linus Torvalds 已提交
503 504 505 506
	return acpi_slit->entry[index + node_to_pxm(b)];
}

EXPORT_SYMBOL(__node_distance);
507 508 509 510 511 512 513 514 515 516 517

/*
 * Map the physical address @start of memory being added to a node id by
 * looking it up in the recorded hot-add areas. All nodes are scanned, so the
 * last matching node wins; node 0 is returned when no area contains @start.
 */
int memory_add_physaddr_to_nid(u64 start)
{
	int nid = 0;
	int node;

	for_each_node(node) {
		if (start < nodes_add[node].start ||
		    start >= nodes_add[node].end)
			continue;
		nid = node;
	}

	return nid;
}
518 519
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);