/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/sched.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>
#include <asm/k8.h>

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

struct memnode memnode;

s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

int numa_off __initdata;
static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;

DEFINE_PER_CPU(int, node_number) = 0;
EXPORT_PER_CPU_SYMBOL(node_number);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost RAM (shift too big)
 */
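/*
 * Example (illustrative, not from the original source): with shift == 28
 * and node 0 covering [0, 0x30000000), the loop below sets
 * memnodemap[0..2] to 0; a second node whose range overlaps those slots
 * would trip the NUMA_NO_NODE check and make the function return -1.
 */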
static int __init populate_memnodemap(const struct bootnode *nodes,
				      int numnodes, int shift, int *nodeids)
{
	unsigned long addr, end;
	int i, res = -1;

	memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
	for (i = 0; i < numnodes; i++) {
		addr = nodes[i].start;
		end = nodes[i].end;
		if (addr >= end)
			continue;
		if ((end >> shift) >= memnodemapsize)
			return 0;
		do {
			if (memnodemap[addr >> shift] != NUMA_NO_NODE)
				return -1;

			if (!nodeids)
				memnodemap[addr >> shift] = i;
			else
				memnodemap[addr >> shift] = nodeids[i];

			addr += (1UL << shift);
		} while (addr < end);
		res = 1;
	}
	return res;
}

static int __init allocate_cachealigned_memnodemap(void)
{
	unsigned long addr;

	memnodemap = memnode.embedded_map;
	if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
		return 0;

	addr = 0x8000;
	nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
	nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
				      nodemap_size, L1_CACHE_BYTES);
	if (nodemap_addr == -1UL) {
		printk(KERN_ERR
		       "NUMA: Unable to allocate Memory to Node hash map\n");
		nodemap_addr = nodemap_size = 0;
		return -1;
	}
	memnodemap = phys_to_virt(nodemap_addr);
	reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");

	printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
	       nodemap_addr, nodemap_addr + nodemap_size);
	return 0;
}

/*
 * The LSB of all start and end addresses in the node map is the value of the
 * maximum possible shift.
 */
static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
					 int numnodes)
{
	int i, nodes_used = 0;
	unsigned long start, end;
	unsigned long bitfield = 0, memtop = 0;

	for (i = 0; i < numnodes; i++) {
		start = nodes[i].start;
		end = nodes[i].end;
		if (start >= end)
			continue;
		bitfield |= start;
		nodes_used++;
		if (end > memtop)
			memtop = end;
	}
	if (nodes_used <= 1)
		i = 63;
	else
		i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
	memnodemapsize = (memtop >> i)+1;
	return i;
}
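
/*
 * Example (illustrative): two nodes covering [0, 0x40000000) and
 * [0x40000000, 0x80000000) OR their start addresses into
 * bitfield == 0x40000000, so the lowest set bit gives a shift of 30,
 * and memnodemapsize becomes (0x80000000 >> 30) + 1 == 3.
 */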

int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
			      int *nodeids)
{
	int shift;

	shift = extract_lsb_from_nodes(nodes, numnodes);
	if (allocate_cachealigned_memnodemap())
		return -1;
	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
		shift);

	if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
		printk(KERN_INFO "Your memory is not aligned; you need to "
		       "rebuild your kernel with a bigger NODEMAPSIZE, "
		       "shift=%d\n", shift);
		return -1;
	}
	return shift;
}
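
/*
 * Illustrative sketch (not in the original source): once memnode_shift
 * and memnodemap[] are set up, a physical-address-to-node lookup is a
 * single shift and array index.  With memnode_shift == 24, for example:
 *
 *	nid = memnodemap[0x12345678UL >> 24];	  (== memnodemap[0x12])
 *
 * phys_to_nid() is essentially this lookup, which is how
 * __early_pfn_to_nid() below resolves a pfn to a node.
 */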

int __meminit __early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}

static void * __init early_node_mem(int nodeid, unsigned long start,
				    unsigned long end, unsigned long size,
				    unsigned long align)
{
	unsigned long mem = find_e820_area(start, end, size, align);
	void *ptr;

	if (mem != -1L)
		return __va(mem);

	ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
	if (ptr == NULL) {
		printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
		       size, nodeid);
		return NULL;
	}
	return ptr;
}

/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start,
			       unsigned long end)
{
	unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
	unsigned long bootmap_start, nodedata_phys;
	void *bootmap;
	const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
	int nid;

	start = roundup(start, ZONE_ALIGN);

	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
	       start, end);

	start_pfn = start >> PAGE_SHIFT;
	last_pfn = end >> PAGE_SHIFT;

	node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
					   SMP_CACHE_BYTES);
	if (node_data[nodeid] == NULL)
		return;
	nodedata_phys = __pa(node_data[nodeid]);
	printk(KERN_INFO "  NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
		nodedata_phys + pgdat_size - 1);

	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;

	/*
	 * Find a place for the bootmem map.
	 * nodedata_phys could end up on another node if it was allocated
	 * by alloc_bootmem, so make sure bootmap_start is not too small;
	 * otherwise early_node_mem will grab the range with find_e820_area
	 * instead of alloc_bootmem, and that could clash with a reserved
	 * range.
	 */
	bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
	nid = phys_to_nid(nodedata_phys);
	if (nid == nodeid)
		bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
	else
		bootmap_start = roundup(start, PAGE_SIZE);
	/*
	 * SMP_CACHE_BYTES could be enough, but init_bootmem_node likes
	 * to use it to align to PAGE_SIZE.
	 */
	bootmap = early_node_mem(nodeid, bootmap_start, end,
				 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
	if (bootmap == NULL) {
		if (nodedata_phys < start || nodedata_phys >= end)
			free_bootmem(nodedata_phys, pgdat_size);
		node_data[nodeid] = NULL;
		return;
	}
	bootmap_start = __pa(bootmap);

	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
					 bootmap_start >> PAGE_SHIFT,
					 start_pfn, last_pfn);

	printk(KERN_INFO "  bootmap [%016lx -  %016lx] pages %lx\n",
		 bootmap_start, bootmap_start + bootmap_size - 1,
		 bootmap_pages);

	free_bootmem_with_active_regions(nodeid, end);

	/*
	 * Convert early reservations to bootmem reservations first;
	 * otherwise early_node_mem could use early-reserved memory
	 * on a previous node.
	 */
	early_res_to_bootmem(start, end);

	/*
	 * In some cases early_node_mem could use alloc_bootmem to get
	 * a range on another node; don't reserve that again.
	 */
	if (nid != nodeid)
		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nodeid, nid);
	else
		reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
					pgdat_size, BOOTMEM_DEFAULT);
	nid = phys_to_nid(bootmap_start);
	if (nid != nodeid)
		printk(KERN_INFO "    bootmap(%d) on node %d\n", nodeid, nid);
	else
		reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
				 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);

#ifdef CONFIG_ACPI_NUMA
	srat_reserve_add_area(nodeid);
#endif
	node_set_online(nodeid);
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this fill in the mapping for all possible CPUs,
 * as the number of CPUs is not known yet. We round robin the existing
 * nodes.
 */
void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node(rr, node_online_map);
		if (rr == MAX_NUMNODES)
			rr = first_node(node_online_map);
	}
}

#ifdef CONFIG_NUMA_EMU
/* Numa emulation */
static char *cmdline __initdata;

/*
 * Sets up nid to range from addr to addr + size.  If the end
 * boundary is greater than max_addr, then max_addr is used instead.
 * The return value is 0 if there is additional memory left for
 * allocation past addr and -1 otherwise.  addr is adjusted to be at
 * the end of the node.
 */
static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
				   u64 size, u64 max_addr)
{
	int ret = 0;

	nodes[nid].start = *addr;
	*addr += size;
	if (*addr >= max_addr) {
		*addr = max_addr;
		ret = -1;
	}
	nodes[nid].end = *addr;
	node_set(nid, node_possible_map);
	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
	       nodes[nid].start, nodes[nid].end,
	       (nodes[nid].end - nodes[nid].start) >> 20);
	return ret;
}

/*
 * Splits num_nodes nodes up equally starting at node_start.  The return value
 * is the number of nodes split up and addr is adjusted to be at the end of the
 * last node allocated.
 */
static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
				      u64 max_addr, int node_start,
				      int num_nodes)
{
	unsigned int big;
	u64 size;
	int i;

	if (num_nodes <= 0)
		return -1;
	if (num_nodes > MAX_NUMNODES)
		num_nodes = MAX_NUMNODES;
	size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
	       num_nodes;
	/*
	 * Calculate the number of big nodes that can be allocated as a result
	 * of consolidating the leftovers.
	 */
	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
	      FAKE_NODE_MIN_SIZE;

	/* Round down to nearest FAKE_NODE_MIN_SIZE. */
	size &= FAKE_NODE_MIN_HASH_MASK;
	if (!size) {
		printk(KERN_ERR "Not enough memory for each node.  "
		       "NUMA emulation disabled.\n");
		return -1;
	}

	for (i = node_start; i < num_nodes + node_start; i++) {
		u64 end = *addr + size;

		if (i < big)
			end += FAKE_NODE_MIN_SIZE;
		/*
		 * The final node can have the remaining system RAM.  Other
		 * nodes receive roughly the same amount of available pages.
		 */
		if (i == num_nodes + node_start - 1)
			end = max_addr;
		else
			while (end - *addr - e820_hole_size(*addr, end) <
			       size) {
				end += FAKE_NODE_MIN_SIZE;
				if (end > max_addr) {
					end = max_addr;
					break;
				}
			}
		if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
			break;
	}
	return i - node_start + 1;
}

/*
 * Splits the remaining system RAM into chunks of size.  The remaining memory is
 * always assigned to a final node and can be asymmetric.  Returns the number of
 * nodes split.
 */
static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
				      u64 max_addr, int node_start, u64 size)
{
	int i = node_start;
	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
	while (!setup_node_range(i++, nodes, addr, size, max_addr))
		;
	return i - node_start;
}

/*
 * Sets up the system RAM area from start_pfn to last_pfn according to the
 * numa=fake command-line option.
 */
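/*
 * Illustrative examples of the syntax accepted by the parser below
 * (sizes are in megabytes and get rounded down to FAKE_NODE_MIN_SIZE):
 *
 *	numa=fake=4		four fake nodes of equal size
 *	numa=fake=2*512		two 512MB nodes, plus one final node
 *				covering the remaining system RAM
 *	numa=fake=2*512,	two 512MB nodes; the trailing comma leaves
 *				the remaining RAM unallocated
 */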
static struct bootnode nodes[MAX_NUMNODES] __initdata;

static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
{
	u64 size, addr = start_pfn << PAGE_SHIFT;
	u64 max_addr = last_pfn << PAGE_SHIFT;
	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;

	memset(&nodes, 0, sizeof(nodes));
	/*
	 * If the numa=fake command-line is just a single number N, split the
	 * system RAM into N fake nodes.
	 */
	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
		long n = simple_strtol(cmdline, NULL, 0);

		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
		if (num_nodes < 0)
			return num_nodes;
		goto out;
	}

	/* Parse the command line. */
	for (coeff_flag = 0; ; cmdline++) {
		if (*cmdline && isdigit(*cmdline)) {
			num = num * 10 + *cmdline - '0';
			continue;
		}
		if (*cmdline == '*') {
			if (num > 0)
				coeff = num;
			coeff_flag = 1;
		}
		if (!*cmdline || *cmdline == ',') {
			if (!coeff_flag)
				coeff = 1;
			/*
			 * Round down to the nearest FAKE_NODE_MIN_SIZE.
			 * Command-line coefficients are in megabytes.
			 */
			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
			if (size)
				for (i = 0; i < coeff; i++, num_nodes++)
					if (setup_node_range(num_nodes, nodes,
						&addr, size, max_addr) < 0)
						goto done;
			if (!*cmdline)
				break;
			coeff_flag = 0;
			coeff = -1;
		}
		num = 0;
	}
done:
	if (!num_nodes)
		return -1;
	/* Fill remainder of system RAM, if appropriate. */
	if (addr < max_addr) {
		if (coeff_flag && coeff < 0) {
			/* Split remaining nodes into num-sized chunks */
			num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
							 num_nodes, num);
			goto out;
		}
		switch (*(cmdline - 1)) {
		case '*':
			/* Split remaining nodes into coeff chunks */
			if (coeff <= 0)
				break;
			num_nodes += split_nodes_equally(nodes, &addr, max_addr,
							 num_nodes, coeff);
			break;
		case ',':
			/* Do not allocate remaining system RAM */
			break;
		default:
			/* Give one final node */
			setup_node_range(num_nodes, nodes, &addr,
					 max_addr - addr, max_addr);
			num_nodes++;
		}
	}
out:
	memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
		       "disabled.\n");
		return -1;
	}

	/*
	 * We need to vacate all active ranges that may have been registered by
	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
	 * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
	 */
	remove_all_active_ranges();
#ifdef CONFIG_ACPI_NUMA
	acpi_numa = -1;
#endif
	for_each_node_mask(i, node_possible_map) {
		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
						nodes[i].end >> PAGE_SHIFT);
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	}
	acpi_fake_nodes(nodes, num_nodes);
	numa_init_array();
	return 0;
}
#endif /* CONFIG_NUMA_EMU */

void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
{
	int i;

	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);

#ifdef CONFIG_NUMA_EMU
	if (cmdline && !numa_emulation(start_pfn, last_pfn))
		return;
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
#endif

#ifdef CONFIG_ACPI_NUMA
	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
					  last_pfn << PAGE_SHIFT))
		return;
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
#endif

#ifdef CONFIG_K8_NUMA
	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
					last_pfn<<PAGE_SHIFT))
		return;
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
#endif
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");

	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       start_pfn << PAGE_SHIFT,
	       last_pfn << PAGE_SHIFT);
	/* setup dummy node covering all memory */
	memnode_shift = 63;
	memnodemap = memnode.embedded_map;
	memnodemap[0] = 0;
	node_set_online(0);
	node_set(0, node_possible_map);
	for (i = 0; i < nr_cpu_ids; i++)
		numa_set_node(i, 0);
	e820_register_active_regions(0, start_pfn, last_pfn);
	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
}

unsigned long __init numa_free_all_bootmem(void)
{
	unsigned long pages = 0;
	int i;

	for_each_online_node(i)
		pages += free_all_bootmem_node(NODE_DATA(i));

	return pages;
}

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	free_area_init_nodes(max_zone_pfns);
}

static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
#ifdef CONFIG_NUMA_EMU
	if (!strncmp(opt, "fake=", 5))
		cmdline = opt + 5;
#endif
#ifdef CONFIG_ACPI_NUMA
	if (!strncmp(opt, "noacpi", 6))
		acpi_numa = -1;
	if (!strncmp(opt, "hotadd=", 7))
		hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
	return 0;
}
early_param("numa", numa_setup);
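
/*
 * Summary of the "numa=" early parameters handled above (illustrative;
 * see the strncmp checks for the authoritative list):
 *
 *	numa=off	disable NUMA setup
 *	numa=fake=...	NUMA emulation; syntax as in numa_emulation()
 *	numa=noacpi	ignore the ACPI SRAT (sets acpi_numa = -1)
 *	numa=hotadd=N	store N (a percentage) in hotadd_percent for
 *			the ACPI hot-add code
 */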

#ifdef CONFIG_NUMA
/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and the faking-node case (when running a kernel compiled
 * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round-robin manner in numa_init_array,
 * prior to this call, and this initialization is good enough
 * for the fake NUMA cases.
 *
 * Called before the per_cpu areas are setup.
 */
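/*
 * Example (illustrative, hypothetical values): if SRAT parsing left
 * apicid_to_node[4] == 1 and x86_cpu_to_apicid[] maps cpu 2 to
 * apicid 4, the loop below ends up calling numa_set_node(2, 1),
 * provided node 1 is online.
 */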
void __init init_cpu_to_node(void)
{
	int cpu;
	u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node;
		u16 apicid = cpu_to_apicid[cpu];

		if (apicid == BAD_APICID)
			continue;
		node = apicid_to_node[apicid];
		if (node == NUMA_NO_NODE)
			continue;
		if (!node_online(node))
			continue;
		numa_set_node(cpu, node);
	}
}
#endif

void __cpuinit numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	if (node != NUMA_NO_NODE)
		per_cpu(node_number, cpu) = node;
}

void __cpuinit numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

void __cpuinit numa_add_cpu(int cpu)
{
	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void __cpuinit numa_remove_cpu(int cpu)
{
	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

#else /* CONFIG_DEBUG_PER_CPU_MAPS */

/*
 * --------- debug versions of the numa functions ---------
 */
static void __cpuinit numa_set_cpumask(int cpu, int enable)
{
	int node = early_cpu_to_node(cpu);
	struct cpumask *mask;
	char buf[64];

	mask = node_to_cpumask_map[node];
	if (mask == NULL) {
		printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	cpulist_scnprintf(buf, sizeof(buf), mask);
	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
		enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
}

void __cpuinit numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, 1);
}

void __cpuinit numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, 0);
}

int cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
			"cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are setup.
 */
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!cpu_possible(cpu)) {
		printk(KERN_WARNING
			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		dump_stack();
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}

/*
 * --------- end of debug versions of the numa functions ---------
 */

#endif /* CONFIG_DEBUG_PER_CPU_MAPS */