/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/sched.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>
#include <asm/k8.h>

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
# define DBG(x...) printk(KERN_DEBUG x)
#else
# define DBG(x...)
#endif

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

struct memnode memnode;

s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

int numa_off __initdata;
static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;

DEFINE_PER_CPU(int, node_number) = 0;
EXPORT_PER_CPU_SYMBOL(node_number);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

/*
 * Which logical CPUs are on which nodes
 */
cpumask_t *node_to_cpumask_map;
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Given a shift value, try to populate memnodemap[]
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost RAM (shift too big)
 */
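/*
 * Illustrative example: two nodes covering [0, 4GB) and [4GB, 8GB)
 * with shift = 31 give memnodemap[0..1] = 0 and memnodemap[2..3] = 1,
 * so a phys_to_nid() lookup reduces to memnodemap[addr >> 31].
 */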
static int __init populate_memnodemap(const struct bootnode *nodes,
				      int numnodes, int shift, int *nodeids)
{
	unsigned long addr, end;
	int i, res = -1;

	memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
	for (i = 0; i < numnodes; i++) {
		addr = nodes[i].start;
		end = nodes[i].end;
		if (addr >= end)
			continue;
		if ((end >> shift) >= memnodemapsize)
			return 0;
		do {
			if (memnodemap[addr >> shift] != NUMA_NO_NODE)
				return -1;

			if (!nodeids)
				memnodemap[addr >> shift] = i;
			else
				memnodemap[addr >> shift] = nodeids[i];

			addr += (1UL << shift);
		} while (addr < end);
		res = 1;
	}
	return res;
}

static int __init allocate_cachealigned_memnodemap(void)
{
	unsigned long addr;

	memnodemap = memnode.embedded_map;
	if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
		return 0;

	addr = 0x8000;
	nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
	nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
				      nodemap_size, L1_CACHE_BYTES);
	if (nodemap_addr == -1UL) {
		printk(KERN_ERR
		       "NUMA: Unable to allocate Memory to Node hash map\n");
		nodemap_addr = nodemap_size = 0;
		return -1;
	}
	memnodemap = phys_to_virt(nodemap_addr);
	reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");

	printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
	       nodemap_addr, nodemap_addr + nodemap_size);
	return 0;
}

/*
 * The LSB of all start addresses in the node map is the value of the
 * maximum possible shift.
 */
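/*
 * Illustrative example: nodes starting at 0x0 and 0x80000000 with
 * memtop = 0x100000000 give bitfield = 0x80000000; the first set bit
 * (and hence the shift) is 31, and memnodemapsize becomes
 * (0x100000000 >> 31) + 1 = 3.
 */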
static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
					 int numnodes)
{
	int i, nodes_used = 0;
	unsigned long start, end;
	unsigned long bitfield = 0, memtop = 0;

	for (i = 0; i < numnodes; i++) {
		start = nodes[i].start;
		end = nodes[i].end;
		if (start >= end)
			continue;
		bitfield |= start;
		nodes_used++;
		if (end > memtop)
			memtop = end;
	}
	if (nodes_used <= 1)
		i = 63;
	else
		i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
	memnodemapsize = (memtop >> i)+1;
	return i;
}

int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
			      int *nodeids)
{
	int shift;

	shift = extract_lsb_from_nodes(nodes, numnodes);
	if (allocate_cachealigned_memnodemap())
		return -1;
	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
		shift);

	if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
		printk(KERN_INFO "Your memory is not aligned you need to "
		       "rebuild your kernel with a bigger NODEMAPSIZE "
		       "shift=%d\n", shift);
		return -1;
	}
	return shift;
}

int early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}

static void * __init early_node_mem(int nodeid, unsigned long start,
				    unsigned long end, unsigned long size,
				    unsigned long align)
{
	unsigned long mem = find_e820_area(start, end, size, align);
	void *ptr;

	if (mem != -1L)
		return __va(mem);

	ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
	if (ptr == NULL) {
		printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
		       size, nodeid);
		return NULL;
	}
	return ptr;
}

/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start,
			       unsigned long end)
{
	unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
	unsigned long bootmap_start, nodedata_phys;
	void *bootmap;
	const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
	int nid;

	start = roundup(start, ZONE_ALIGN);

	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
	       start, end);

	start_pfn = start >> PAGE_SHIFT;
	last_pfn = end >> PAGE_SHIFT;

	node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
					   SMP_CACHE_BYTES);
	if (node_data[nodeid] == NULL)
		return;
	nodedata_phys = __pa(node_data[nodeid]);
	printk(KERN_INFO "  NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
		nodedata_phys + pgdat_size - 1);

	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;

	/*
	 * Find a place for the bootmem map.
	 * nodedata_phys could have been allocated on another node by
	 * alloc_bootmem, so make sure bootmap_start is not too low;
	 * otherwise early_node_mem would satisfy it with find_e820_area
	 * instead of alloc_bootmem, which could clash with the reserved
	 * range.
	 */
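	/*
	 * Resulting layout when both allocations land on this node
	 * (illustrative):
	 *
	 *   nodedata_phys .. nodedata_phys + pgdat_size    pg_data_t
	 *   next page boundary ..                          bootmem map
	 */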
	bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
	nid = phys_to_nid(nodedata_phys);
	if (nid == nodeid)
		bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
	else
		bootmap_start = roundup(start, PAGE_SIZE);
	/*
	 * SMP_CACHE_BYTES could be enough, but init_bootmem_node
	 * likes to use PAGE_SIZE for the alignment.
	 */
	bootmap = early_node_mem(nodeid, bootmap_start, end,
				 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
	if (bootmap == NULL)  {
		if (nodedata_phys < start || nodedata_phys >= end)
			free_bootmem(nodedata_phys, pgdat_size);
		node_data[nodeid] = NULL;
		return;
	}
	bootmap_start = __pa(bootmap);

	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
					 bootmap_start >> PAGE_SHIFT,
					 start_pfn, last_pfn);

	printk(KERN_INFO "  bootmap [%016lx -  %016lx] pages %lx\n",
		 bootmap_start, bootmap_start + bootmap_size - 1,
		 bootmap_pages);

	free_bootmem_with_active_regions(nodeid, end);

	/*
	 * Convert early reservations to bootmem reservations first;
	 * otherwise a later early_node_mem call could be handed
	 * memory on a previous node that is only early-reserved.
	 */
	early_res_to_bootmem(start, end);

	/*
	 * In some cases early_node_mem used alloc_bootmem and got a
	 * range on another node; don't reserve that range again.
	 */
	if (nid != nodeid)
		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nodeid, nid);
	else
		reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
					pgdat_size, BOOTMEM_DEFAULT);
	nid = phys_to_nid(bootmap_start);
	if (nid != nodeid)
		printk(KERN_INFO "    bootmap(%d) on node %d\n", nodeid, nid);
	else
		reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
				 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);

#ifdef CONFIG_ACPI_NUMA
	srat_reserve_add_area(nodeid);
#endif
	node_set_online(nodeid);
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this, fill in the mapping for all possible CPUs,
 * as the number of CPUs is not known yet. We round robin the existing
 * nodes.
 */
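/*
 * Illustrative example: with nodes 0 and 1 online and CPUs 0-3 still
 * unmapped, the loop below assigns them to nodes 0, 1, 0, 1.
 */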
void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node(rr, node_online_map);
		if (rr == MAX_NUMNODES)
			rr = first_node(node_online_map);
	}
}

#ifdef CONFIG_NUMA_EMU
/* NUMA emulation */
static char *cmdline __initdata;

/*
 * Sets up nid to range from addr to addr + size.  If the end
 * boundary is greater than max_addr, then max_addr is used instead.
 * The return value is 0 if there is additional memory left for
 * allocation past addr and -1 otherwise.  addr is adjusted to be at
 * the end of the node.
 */
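/*
 * Illustrative example: with *addr = 0, size = 512MB and a larger
 * max_addr, setup_node_range(0, ...) records node 0 as [0, 512MB),
 * advances *addr to 512MB and returns 0.
 */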
static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
				   u64 size, u64 max_addr)
{
	int ret = 0;

	nodes[nid].start = *addr;
	*addr += size;
	if (*addr >= max_addr) {
		*addr = max_addr;
		ret = -1;
	}
	nodes[nid].end = *addr;
	node_set(nid, node_possible_map);
	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
	       nodes[nid].start, nodes[nid].end,
	       (nodes[nid].end - nodes[nid].start) >> 20);
	return ret;
}

/*
 * Splits num_nodes nodes up equally starting at node_start.  The return value
 * is the number of nodes split up and addr is adjusted to be at the end of the
 * last node allocated.
 */
static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
				      u64 max_addr, int node_start,
				      int num_nodes)
{
	unsigned int big;
	u64 size;
	int i;

	if (num_nodes <= 0)
		return -1;
	if (num_nodes > MAX_NUMNODES)
		num_nodes = MAX_NUMNODES;
	size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
	       num_nodes;
	/*
	 * Calculate the number of big nodes that can be allocated as a result
	 * of consolidating the leftovers.
	 */
	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
	      FAKE_NODE_MIN_SIZE;

	/* Round down to nearest FAKE_NODE_MIN_SIZE. */
	size &= FAKE_NODE_MIN_HASH_MASK;
	if (!size) {
		printk(KERN_ERR "Not enough memory for each node.  "
		       "NUMA emulation disabled.\n");
		return -1;
	}

	for (i = node_start; i < num_nodes + node_start; i++) {
		u64 end = *addr + size;

		if (i < big)
			end += FAKE_NODE_MIN_SIZE;
		/*
		 * The final node can have the remaining system RAM.  Other
		 * nodes receive roughly the same amount of available pages.
		 */
		if (i == num_nodes + node_start - 1)
			end = max_addr;
		else
			while (end - *addr - e820_hole_size(*addr, end) <
			       size) {
				end += FAKE_NODE_MIN_SIZE;
				if (end > max_addr) {
					end = max_addr;
					break;
				}
			}
		if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
			break;
	}
	return i - node_start + 1;
}

/*
 * Splits the remaining system RAM into chunks of size.  The remaining memory is
 * always assigned to a final node and can be asymmetric.  Returns the number of
 * nodes split.
 */
static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
				      u64 max_addr, int node_start, u64 size)
{
	int i = node_start;
	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
	while (!setup_node_range(i++, nodes, addr, size, max_addr))
		;
	return i - node_start;
}

/*
 * Sets up the system RAM area from start_pfn to last_pfn according to the
 * numa=fake command-line option.
 */
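/*
 * Illustrative readings of the accepted syntax, as implemented by the
 * parser below (sizes in MB):
 *
 *   numa=fake=4           split RAM into four roughly equal nodes
 *   numa=fake=2*512,1024  two 512MB nodes, one 1024MB node, plus a
 *                         final node covering whatever RAM remains
 *   numa=fake=2*512,      two 512MB nodes; leave the rest unallocated
 *   numa=fake=512,4*      one 512MB node, then split the remaining
 *                         RAM into four equal nodes
 */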
static struct bootnode nodes[MAX_NUMNODES] __initdata;

static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
{
	u64 size, addr = start_pfn << PAGE_SHIFT;
	u64 max_addr = last_pfn << PAGE_SHIFT;
	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;

	memset(&nodes, 0, sizeof(nodes));
	/*
	 * If the numa=fake command-line is just a single number N, split the
	 * system RAM into N fake nodes.
	 */
	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
		long n = simple_strtol(cmdline, NULL, 0);

		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
		if (num_nodes < 0)
			return num_nodes;
		goto out;
	}

	/* Parse the command line. */
	for (coeff_flag = 0; ; cmdline++) {
		if (*cmdline && isdigit(*cmdline)) {
			num = num * 10 + *cmdline - '0';
			continue;
		}
		if (*cmdline == '*') {
			if (num > 0)
				coeff = num;
			coeff_flag = 1;
		}
		if (!*cmdline || *cmdline == ',') {
			if (!coeff_flag)
				coeff = 1;
			/*
			 * Round down to the nearest FAKE_NODE_MIN_SIZE.
			 * Command-line coefficients are in megabytes.
			 */
			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
			if (size)
				for (i = 0; i < coeff; i++, num_nodes++)
					if (setup_node_range(num_nodes, nodes,
						&addr, size, max_addr) < 0)
						goto done;
			if (!*cmdline)
				break;
			coeff_flag = 0;
			coeff = -1;
		}
		num = 0;
	}
done:
	if (!num_nodes)
		return -1;
	/* Fill remainder of system RAM, if appropriate. */
	if (addr < max_addr) {
		if (coeff_flag && coeff < 0) {
			/* Split remaining nodes into num-sized chunks */
			num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
							 num_nodes, num);
			goto out;
		}
		switch (*(cmdline - 1)) {
		case '*':
			/* Split remaining nodes into coeff chunks */
			if (coeff <= 0)
				break;
			num_nodes += split_nodes_equally(nodes, &addr, max_addr,
							 num_nodes, coeff);
			break;
		case ',':
			/* Do not allocate remaining system RAM */
			break;
		default:
			/* Give one final node */
			setup_node_range(num_nodes, nodes, &addr,
					 max_addr - addr, max_addr);
			num_nodes++;
		}
	}
out:
	memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
		       "disabled.\n");
		return -1;
	}

	/*
	 * We need to vacate all active ranges that may have been registered by
	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
	 * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
	 */
	remove_all_active_ranges();
#ifdef CONFIG_ACPI_NUMA
	acpi_numa = -1;
#endif
	for_each_node_mask(i, node_possible_map) {
		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
						nodes[i].end >> PAGE_SHIFT);
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	}
	acpi_fake_nodes(nodes, num_nodes);
	numa_init_array();
	return 0;
}
#endif /* CONFIG_NUMA_EMU */

void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
{
	int i;

	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);

#ifdef CONFIG_NUMA_EMU
	if (cmdline && !numa_emulation(start_pfn, last_pfn))
		return;
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
#endif

#ifdef CONFIG_ACPI_NUMA
	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
					  last_pfn << PAGE_SHIFT))
		return;
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
#endif

#ifdef CONFIG_K8_NUMA
	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
					last_pfn<<PAGE_SHIFT))
		return;
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
#endif
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");

	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       start_pfn << PAGE_SHIFT,
	       last_pfn << PAGE_SHIFT);
	/* setup dummy node covering all memory */
	memnode_shift = 63;
	memnodemap = memnode.embedded_map;
	memnodemap[0] = 0;
	node_set_online(0);
	node_set(0, node_possible_map);
	for (i = 0; i < nr_cpu_ids; i++)
		numa_set_node(i, 0);
	e820_register_active_regions(0, start_pfn, last_pfn);
	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
}

unsigned long __init numa_free_all_bootmem(void)
{
	unsigned long pages = 0;
	int i;

	for_each_online_node(i)
		pages += free_all_bootmem_node(NODE_DATA(i));

	return pages;
}

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	free_area_init_nodes(max_zone_pfns);
}

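/*
 * "numa=" early parameter, handled below (illustrative forms):
 *   numa=off       disable NUMA setup entirely
 *   numa=fake=...  NUMA emulation, see numa_emulation() above
 *   numa=noacpi    ignore the ACPI SRAT table (CONFIG_ACPI_NUMA)
 *   numa=hotadd=N  pass N, a percentage, to the ACPI SRAT memory
 *                  hot-add handling (CONFIG_ACPI_NUMA)
 */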
static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
#ifdef CONFIG_NUMA_EMU
	if (!strncmp(opt, "fake=", 5))
		cmdline = opt + 5;
#endif
#ifdef CONFIG_ACPI_NUMA
	if (!strncmp(opt, "noacpi", 6))
		acpi_numa = -1;
	if (!strncmp(opt, "hotadd=", 7))
		hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
	return 0;
}
early_param("numa", numa_setup);

#ifdef CONFIG_NUMA
/*
 * Set up early cpu_to_node.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[]
 * and apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and the fake-node case (running a kernel compiled
 * for NUMA on a non-NUMA box), which is OK: cpu_to_node[]
 * is already initialized in a round-robin manner by
 * numa_init_array() prior to this call, and that initialization
 * is good enough for the fake NUMA cases.
 *
 * Called before the per_cpu areas are set up.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node;
		u16 apicid = cpu_to_apicid[cpu];

		if (apicid == BAD_APICID)
			continue;
		node = apicid_to_node[apicid];
		if (node == NUMA_NO_NODE)
			continue;
		if (!node_online(node))
			continue;
		numa_set_node(cpu, node);
	}
}
#endif


/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: node_to_cpumask() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
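/*
 * Illustrative example: with node_possible_map = { 0, 2 }, the loop
 * below leaves num = 2, so nr_node_ids becomes 3 and the map gets
 * three cpumask_t entries.
 */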
void __init setup_node_to_cpumask_map(void)
{
	unsigned int node, num = 0;
	cpumask_t *map;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES) {
		for_each_node_mask(node, node_possible_map)
			num = node;
		nr_node_ids = num + 1;
	}

	/* allocate the map */
	map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
	DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);

	pr_debug("Node to cpumask map at %p for %d nodes\n",
		 map, nr_node_ids);

	/* node_to_cpumask() will now work */
	node_to_cpumask_map = map;
}

void __cpuinit numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	if (node != NUMA_NO_NODE)
		per_cpu(node_number, cpu) = node;
}

void __cpuinit numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

void __cpuinit numa_add_cpu(int cpu)
{
	cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void __cpuinit numa_remove_cpu(int cpu)
{
	cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

#else /* CONFIG_DEBUG_PER_CPU_MAPS */

/*
 * --------- debug versions of the numa functions ---------
 */
static void __cpuinit numa_set_cpumask(int cpu, int enable)
{
	int node = early_cpu_to_node(cpu);
	cpumask_t *mask;
	char buf[64];

	if (node_to_cpumask_map == NULL) {
		printk(KERN_ERR "node_to_cpumask_map NULL\n");
		dump_stack();
		return;
	}

	mask = &node_to_cpumask_map[node];
	if (enable)
		cpu_set(cpu, *mask);
	else
		cpu_clear(cpu, *mask);

	cpulist_scnprintf(buf, sizeof(buf), mask);
	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
		enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
}

void __cpuinit numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, 1);
}

void __cpuinit numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, 0);
}

int cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
			"cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are setup.
 */
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!per_cpu_offset(cpu)) {
		printk(KERN_WARNING
			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		dump_stack();
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}


/* empty cpumask */
static const cpumask_t cpu_mask_none;

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const cpumask_t *cpumask_of_node(int node)
{
	if (node_to_cpumask_map == NULL) {
		printk(KERN_WARNING
			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
			node);
		dump_stack();
		return (const cpumask_t *)&cpu_online_map;
	}
	if (node >= nr_node_ids) {
		printk(KERN_WARNING
			"cpumask_of_node(%d): node > nr_node_ids(%d)\n",
			node, nr_node_ids);
		dump_stack();
		return &cpu_mask_none;
	}
	return &node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

/*
 * Returns a bitmask of CPUs on Node 'node'.
 *
 * Side note: this function creates the returned cpumask on the stack
 * so with a high NR_CPUS count, excessive stack space is used.  The
 * node_to_cpumask_ptr function should be used whenever possible.
 */
cpumask_t node_to_cpumask(int node)
{
	if (node_to_cpumask_map == NULL) {
		printk(KERN_WARNING
			"node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
		dump_stack();
		return cpu_online_map;
	}
	if (node >= nr_node_ids) {
		printk(KERN_WARNING
			"node_to_cpumask(%d): node > nr_node_ids(%d)\n",
			node, nr_node_ids);
		dump_stack();
		return cpu_mask_none;
	}
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(node_to_cpumask);

/*
 * --------- end of debug versions of the numa functions ---------
 */

#endif /* CONFIG_DEBUG_PER_CPU_MAPS */