srat_64.c 12.4 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
18 19
#include <linux/bootmem.h>
#include <linux/mm.h>
L
Linus Torvalds 已提交
20 21
#include <asm/proto.h>
#include <asm/numa.h>
22
#include <asm/e820.h>
I
Ingo Molnar 已提交
23
#include <asm/apic.h>
I
Ingo Molnar 已提交
24
#include <asm/uv/uv.h>
L
Linus Torvalds 已提交
25

A
Andi Kleen 已提交
26 27
int acpi_numa __initdata;

L
Linus Torvalds 已提交
28 29 30
static struct acpi_table_slit *acpi_slit;

static nodemask_t nodes_parsed __initdata;
31
static nodemask_t cpu_nodes_parsed __initdata;
32
static struct bootnode nodes[MAX_NUMNODES] __initdata;
33
static struct bootnode nodes_add[MAX_NUMNODES];
L
Linus Torvalds 已提交
34

35 36 37 38
static int num_node_memblks __initdata;
static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;

L
Linus Torvalds 已提交
39 40
/*
 * Map a proximity domain to a node id by delegating to
 * acpi_map_pxm_to_node().  Callers in this file treat a negative result
 * as "no node available".
 */
static __init int setup_node(int pxm)
{
	int node = acpi_map_pxm_to_node(pxm);

	return node;
}

44
static __init int conflicting_memblks(unsigned long start, unsigned long end)
L
Linus Torvalds 已提交
45 46
{
	int i;
47 48
	for (i = 0; i < num_node_memblks; i++) {
		struct bootnode *nd = &node_memblk_range[i];
L
Linus Torvalds 已提交
49 50 51
		if (nd->start == nd->end)
			continue;
		if (nd->end > start && nd->start < end)
52
			return memblk_nodeid[i];
L
Linus Torvalds 已提交
53
		if (nd->end == end && nd->start == start)
54
			return memblk_nodeid[i];
L
Linus Torvalds 已提交
55 56 57 58 59 60
	}
	return -1;
}

static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
61
	struct bootnode *nd = &nodes[i];
62

L
Linus Torvalds 已提交
63 64 65 66 67 68 69 70 71 72 73 74 75 76
	if (nd->start < start) {
		nd->start = start;
		if (nd->end < nd->start)
			nd->start = nd->end;
	}
	if (nd->end > end) {
		nd->end = end;
		if (nd->start > nd->end)
			nd->start = nd->end;
	}
}

static __init void bad_srat(void)
{
77
	int i;
L
Linus Torvalds 已提交
78 79
	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
80 81
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		apicid_to_node[i] = NUMA_NO_NODE;
82 83 84 85
	for (i = 0; i < MAX_NUMNODES; i++) {
		nodes[i].start = nodes[i].end = 0;
		nodes_add[i].start = nodes_add[i].end = 0;
	}
86
	remove_all_active_ranges();
L
Linus Torvalds 已提交
87 88 89 90 91 92 93 94 95 96
}

/* Non-zero when NUMA is switched off or the SRAT has been rejected. */
static __init inline int srat_disabled(void)
{
	if (numa_off)
		return 1;
	return acpi_numa < 0;
}

/*
 * Callback for SLIT parsing.
 *
 * Copies the table into a page-aligned area obtained from find_e820_area()
 * and reserves that area, leaving acpi_slit pointing at the copy.  Panics
 * when no area can be found.
 */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	unsigned len = slit->header.length;
	unsigned long addr;

	addr = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, len, PAGE_SIZE);
	if (addr == -1L)
		panic(" Can not save slit!\n");

	acpi_slit = __va(addr);
	memcpy(acpi_slit, slit, len);
	reserve_early(addr, addr + len, "ACPI SLIT");
}

112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
/*
 * Callback for Proximity Domain -> x2APIC mapping.
 *
 * Records the node of one enabled x2APIC affinity entry in
 * apicid_to_node[] and cpu_nodes_parsed.  A truncated entry or an
 * unmappable proximity domain rejects the whole SRAT; an APIC id that
 * does not fit into apicid_to_node[] only skips this entry.
 */
void __init
acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
{
	int pxm, node;
	int apic_id;

	if (srat_disabled())
		return;
	if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
		bad_srat();
		return;
	}
	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
		return;
	pxm = pa->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
		bad_srat();
		return;
	}

	apic_id = pa->apic_id;
	/*
	 * The id comes straight from firmware and is used as an array index
	 * below, so refuse anything outside apicid_to_node[] (including a
	 * value that turned negative when stored in an int).
	 */
	if (apic_id < 0 || apic_id >= MAX_LOCAL_APIC) {
		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n",
		       pxm, apic_id, node);
		return;
	}
	apicid_to_node[apic_id] = node;
	node_set(node, cpu_nodes_parsed);
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
	       pxm, apic_id, node);
}

L
Linus Torvalds 已提交
143 144
/* Callback for Proximity Domain -> LAPIC mapping */
void __init
145
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
L
Linus Torvalds 已提交
146 147
{
	int pxm, node;
148 149
	int apic_id;

150 151
	if (srat_disabled())
		return;
152
	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
153
		bad_srat();
154 155
		return;
	}
156
	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
L
Linus Torvalds 已提交
157
		return;
158
	pxm = pa->proximity_domain_lo;
L
Linus Torvalds 已提交
159 160 161 162 163 164
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
		bad_srat();
		return;
	}
165

166
	if (get_uv_system_type() >= UV_X2APIC)
J
Jack Steiner 已提交
167 168 169
		apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
	else
		apic_id = pa->apic_id;
170
	apicid_to_node[apic_id] = node;
171
	node_set(node, cpu_nodes_parsed);
L
Linus Torvalds 已提交
172
	acpi_numa = 1;
173
	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
174
	       pxm, apic_id, node);
L
Linus Torvalds 已提交
175 176
}

177 178 179 180 181
/*
 * Whether hot-pluggable SRAT memory ranges should be recorded:
 * 1 when built with CONFIG_MEMORY_HOTPLUG_SPARSE, 0 otherwise.
 */
static inline int save_add_info(void)
{
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
	return 1;
#else
	return 0;
#endif
}
182
/*
183 184
 * Update nodes_add[]
 * This code supports one contiguous hot add area per node
185
 */
186 187
static void __init
update_nodes_add(int node, unsigned long start, unsigned long end)
188 189 190
{
	unsigned long s_pfn = start >> PAGE_SHIFT;
	unsigned long e_pfn = end >> PAGE_SHIFT;
191
	int changed = 0;
192 193 194 195 196 197 198 199 200 201
	struct bootnode *nd = &nodes_add[node];

	/* I had some trouble with strange memory hotadd regions breaking
	   the boot. Be very strict here and reject anything unexpected.
	   If you want working memory hotadd write correct SRATs.

	   The node size check is a basic sanity check to guard against
	   mistakes */
	if ((signed long)(end - start) < NODE_MIN_SIZE) {
		printk(KERN_ERR "SRAT: Hotplug area too small\n");
202
		return;
203 204 205
	}

	/* This check might be a bit too strict, but I'm keeping it for now. */
206
	if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
207 208 209
		printk(KERN_ERR
			"SRAT: Hotplug area %lu -> %lu has existing memory\n",
			s_pfn, e_pfn);
210
		return;
211 212 213 214 215
	}

	/* Looks good */

	if (nd->start == nd->end) {
216 217
		nd->start = start;
		nd->end = end;
218
		changed = 1;
219 220 221
	} else {
		if (nd->start == end) {
			nd->start = start;
222 223
			changed = 1;
		}
224 225
		if (nd->end == start) {
			nd->end = end;
226 227 228 229
			changed = 1;
		}
		if (!changed)
			printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
230
	}
231 232

	if (changed)
233 234
		printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
				 nd->start, nd->end);
235 236
}

L
Linus Torvalds 已提交
237 238
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
239
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
L
Linus Torvalds 已提交
240
{
241
	struct bootnode *nd, oldnode;
L
Linus Torvalds 已提交
242 243 244 245
	unsigned long start, end;
	int node, pxm;
	int i;

246
	if (srat_disabled())
L
Linus Torvalds 已提交
247
		return;
248
	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
249 250 251
		bad_srat();
		return;
	}
252
	if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
253
		return;
254 255

	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
256
		return;
257 258
	start = ma->base_address;
	end = start + ma->length;
L
Linus Torvalds 已提交
259 260 261 262 263 264 265
	pxm = ma->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
		bad_srat();
		return;
	}
266
	i = conflicting_memblks(start, end);
267 268 269 270 271
	if (i == node) {
		printk(KERN_WARNING
		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
			pxm, start, end, nodes[i].start, nodes[i].end);
	} else if (i >= 0) {
L
Linus Torvalds 已提交
272
		printk(KERN_ERR
273 274 275
		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
		       pxm, start, end, node_to_pxm(i),
			nodes[i].start, nodes[i].end);
L
Linus Torvalds 已提交
276 277 278 279
		bad_srat();
		return;
	}
	nd = &nodes[node];
280
	oldnode = *nd;
L
Linus Torvalds 已提交
281 282 283 284 285 286 287 288 289
	if (!node_test_and_set(node, nodes_parsed)) {
		nd->start = start;
		nd->end = end;
	} else {
		if (start < nd->start)
			nd->start = start;
		if (nd->end < end)
			nd->end = end;
	}
290

291 292 293 294
	printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
	       start, end);
	e820_register_active_regions(node, start >> PAGE_SHIFT,
				     end >> PAGE_SHIFT);
295

296 297 298
	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
		update_nodes_add(node, start, end);
		/* restore nodes[node] */
299 300 301 302
		*nd = oldnode;
		if ((nd->start | nd->end) == 0)
			node_clear(node, nodes_parsed);
	}
303 304 305 306 307

	node_memblk_range[num_node_memblks].start = start;
	node_memblk_range[num_node_memblks].end = end;
	memblk_nodeid[num_node_memblks] = node;
	num_node_memblks++;
L
Linus Torvalds 已提交
308 309
}

310 311
/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
312
static int __init nodes_cover_memory(const struct bootnode *nodes)
313 314 315 316 317 318 319 320 321
{
	int i;
	unsigned long pxmram, e820ram;

	pxmram = 0;
	for_each_node_mask(i, nodes_parsed) {
		unsigned long s = nodes[i].start >> PAGE_SHIFT;
		unsigned long e = nodes[i].end >> PAGE_SHIFT;
		pxmram += e - s;
322
		pxmram -= absent_pages_in_range(s, e);
323 324
		if ((long)pxmram < 0)
			pxmram = 0;
325 326
	}

327
	e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
328 329
	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
	if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
330 331 332 333 334 335 336 337 338
		printk(KERN_ERR
	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
			(pxmram << PAGE_SHIFT) >> 20,
			(e820ram << PAGE_SHIFT) >> 20);
		return 0;
	}
	return 1;
}

L
Linus Torvalds 已提交
339 340 341 342 343 344
/* Intentionally empty. */
void __init acpi_numa_arch_fixup(void)
{
}

/*
 * Use the information gathered by the SRAT callbacks to set up the nodes.
 *
 * Node spans are first clamped to [start, end].  Returns 0 on success and
 * -1 when no usable SRAT was parsed, when the nodes do not cover memory,
 * or when no memnode hash shift can be computed.
 */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
	int nid, cpu;

	if (acpi_numa <= 0)
		return -1;

	/* First clean up the node list */
	for (nid = 0; nid < MAX_NUMNODES; nid++)
		cutoff_node(nid, start, end);

	if (!nodes_cover_memory(nodes)) {
		bad_srat();
		return -1;
	}

	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
					   memblk_nodeid);
	if (memnode_shift < 0) {
		printk(KERN_ERR
		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	/* Account for nodes with cpus and no memory */
	nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);

	/* Finally register nodes */
	for_each_node_mask(nid, node_possible_map)
		setup_node_bootmem(nid, nodes[nid].start, nodes[nid].end);

	/*
	 * Second pass for nodes that are still offline, in case
	 * setup_node_bootmem missed one due to missing bootmem.
	 */
	for_each_node_mask(nid, node_possible_map) {
		if (node_online(nid))
			continue;
		setup_node_bootmem(nid, nodes[nid].start, nodes[nid].end);
	}

	/* Drop the node assignment of CPUs whose node did not come online. */
	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
		int node = early_cpu_to_node(cpu);

		if (node != NUMA_NO_NODE && !node_online(node))
			numa_clear_node(cpu);
	}
	numa_init_array();
	return 0;
}

391
#ifdef CONFIG_NUMA_EMU
392 393 394
static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
	[0 ... MAX_NUMNODES-1] = PXM_INVAL
};
395
static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
396 397
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413
/*
 * Return the first node in nodes_parsed whose span [start, end) contains
 * @addr, or NUMA_NO_NODE when no parsed node contains it.
 *
 * acpi_fake_nodes() passes an emulated node's starting address to find
 * the real node that the emulated node appears on.
 */
static int __init find_node_by_addr(unsigned long addr)
{
	int nid;

	for_each_node_mask(nid, nodes_parsed) {
		if (addr < nodes[nid].start)
			continue;
		if (addr < nodes[nid].end)
			return nid;
	}
	return NUMA_NO_NODE;
}

/*
 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
 * mappings that respect the real ACPI topology but reflect our emulated
 * environment.  Each emulated node is matched to the real node containing
 * its start address and inherits that node's PXM, so SLIT lookups between
 * emulated nodes reflect the real topology.
 *
 * Also rewrites apicid_to_node[] and nodes_parsed in terms of the emulated
 * nodes, and warns if the emulated nodes do not cover memory.
 */
void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
	int fake, apicid;

	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
			 "topology.\n");

	for (fake = 0; fake < num_nodes; fake++) {
		int real, pxm;

		real = find_node_by_addr(fake_nodes[fake].start);
		if (real == NUMA_NO_NODE)
			continue;
		pxm = node_to_pxm(real);
		if (pxm == PXM_INVAL)
			continue;
		fake_node_to_pxm_map[fake] = pxm;

		/*
		 * Every APIC id that mapped to the real node must now map
		 * to the emulated node id.
		 */
		for (apicid = 0; apicid < MAX_LOCAL_APIC; apicid++) {
			if (apicid_to_node[apicid] == real)
				fake_apicid_to_node[apicid] = fake;
		}
	}

	for (fake = 0; fake < num_nodes; fake++)
		__acpi_map_pxm_to_node(fake_node_to_pxm_map[fake], fake);
	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));

	/* From here on nodes_parsed describes the non-empty emulated nodes. */
	nodes_clear(nodes_parsed);
	for (fake = 0; fake < num_nodes; fake++) {
		if (fake_nodes[fake].start != fake_nodes[fake].end)
			node_set(fake, nodes_parsed);
	}
	WARN_ON(!nodes_cover_memory(fake_nodes));
}

/* With NUMA emulation, two nodes count as equal when they share a PXM. */
static int null_slit_node_compare(int a, int b)
{
	int pxm_a = node_to_pxm(a);
	int pxm_b = node_to_pxm(b);

	return pxm_a == pxm_b;
}
#else
/* Without NUMA emulation, two nodes are equal only when the ids match. */
static int null_slit_node_compare(int a, int b)
{
	if (a != b)
		return 0;
	return 1;
}
#endif /* CONFIG_NUMA_EMU */

L
Linus Torvalds 已提交
471 472 473 474 475
int __node_distance(int a, int b)
{
	int index;

	if (!acpi_slit)
476 477
		return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
						      REMOTE_DISTANCE;
478
	index = acpi_slit->locality_count * node_to_pxm(a);
L
Linus Torvalds 已提交
479 480 481 482
	return acpi_slit->entry[index + node_to_pxm(b)];
}

EXPORT_SYMBOL(__node_distance);
483

484
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
485 486 487 488 489 490 491 492 493 494
/*
 * Return the node whose hot-add span in nodes_add[] contains @start.
 * When several spans contain it the highest such node id is returned;
 * when none does the result is 0.
 */
int memory_add_physaddr_to_nid(u64 start)
{
	int nid;
	int found = 0;

	for_each_node(nid) {
		if (start < nodes_add[nid].start)
			continue;
		if (start >= nodes_add[nid].end)
			continue;
		found = nid;
	}

	return found;
}
495
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
496
#endif