numa_32.c 13.9 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
 * August 2002: added remote node KVA remap - Martin J. Bligh 
 *
 * Copyright (C) 2002, IBM Corp.
 *
 * All rights reserved.          
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/mm.h>
#include <linux/bootmem.h>
27
#include <linux/memblock.h>
L
Linus Torvalds 已提交
28 29 30 31
#include <linux/mmzone.h>
#include <linux/highmem.h>
#include <linux/initrd.h>
#include <linux/nodemask.h>
32
#include <linux/module.h>
33
#include <linux/kexec.h>
D
Dave Hansen 已提交
34
#include <linux/pfn.h>
35
#include <linux/swap.h>
M
Mel Gorman 已提交
36
#include <linux/acpi.h>
37

L
Linus Torvalds 已提交
38 39 40
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/mmzone.h>
41
#include <asm/bios_ebda.h>
42
#include <asm/proto.h>
L
Linus Torvalds 已提交
43 44

/*
A
Adrian Bunk 已提交
45
 * numa interface - we expect the numa architecture specific code to have
L
Linus Torvalds 已提交
46 47 48
 *                  populated the following initialisation.
 *
 * 1) node_online_map  - the map of all nodes configured (online) in the system
49
 * 2) node_start_pfn   - the starting page frame number for a node
L
Linus Torvalds 已提交
50 51
 * 3) node_end_pfn     - the ending page fram number for a node
 */
52 53
unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
54

L
Linus Torvalds 已提交
55

56
#ifdef CONFIG_DISCONTIGMEM
L
Linus Torvalds 已提交
57
/*
58
 * 4) physnode_map     - the mapping between a pfn and owning node
L
Linus Torvalds 已提交
59
 * physnode_map keeps track of the physical memory layout of a generic
60 61
 * numa node on a 64Mb break (each element of the array will
 * represent 64Mb of memory and will be marked by the node id.  so,
L
Linus Torvalds 已提交
62 63 64
 * if the first gig is on node 0, and the second gig is on node 1
 * physnode_map will contain:
 *
65 66 67
 *     physnode_map[0-15] = 0;
 *     physnode_map[16-31] = 1;
 *     physnode_map[32- ] = -1;
L
Linus Torvalds 已提交
68
 */
69
s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
70
EXPORT_SYMBOL(physnode_map);
L
Linus Torvalds 已提交
71 72 73 74 75

void memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

76
	printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
L
Linus Torvalds 已提交
77 78 79 80 81
			nid, start, end);
	printk(KERN_DEBUG "  Setting physnode_map array to node %d for pfns:\n", nid);
	printk(KERN_DEBUG "  ");
	for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
		physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
82
		printk(KERN_CONT "%lx ", pfn);
L
Linus Torvalds 已提交
83
	}
84
	printk(KERN_CONT "\n");
L
Linus Torvalds 已提交
85 86 87 88 89 90 91 92 93 94 95 96
}

/*
 * node_memmap_size_bytes - bytes of memmap (struct page array) needed
 * for the pfn range [start_pfn, end_pfn) on node @nid.
 *
 * An empty range costs nothing; otherwise one extra struct page is
 * counted on top of the range (presumably rounding slack -- TODO
 * confirm against the callers' alignment expectations).
 */
unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
					      unsigned long end_pfn)
{
	unsigned long npages = end_pfn - start_pfn;

	return npages ? (npages + 1) * sizeof(struct page) : 0;
}
97
#endif
L
Linus Torvalds 已提交
98 99 100 101 102 103

extern unsigned long find_max_low_pfn(void);
extern unsigned long highend_pfn, highstart_pfn;

#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)

A
Adrian Bunk 已提交
104
static void *node_remap_start_vaddr[MAX_NUMNODES];
L
Linus Torvalds 已提交
105 106 107 108 109 110 111
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);

/*
 * FLAT - support for basic PC memory model with discontig enabled, essentially
 *        a single node with all available processors in it with a flat
 *        memory map.
 */
112
/*
 * Flat fallback configuration: pretend all memory belongs to a single
 * NUMA node (node 0) covering pfns [0, max_pfn).  Always succeeds and
 * returns 1, so it must be tried last by get_memcfg_numa().
 */
static int __init get_memcfg_numa_flat(void)
{
	printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");

	node_start_pfn[0] = 0;
	node_end_pfn[0] = max_pfn;
	memblock_x86_register_active_regions(0, 0, max_pfn);

	/* Indicate there is one node available. */
	nodes_clear(node_online_map);
	node_set_online(0);
	return 1;
}

/*
 * Find the highest page frame number we have available for the node
 */
129
/*
 * Clamp node @nid's pfn range so that it does not extend past max_pfn
 * (the end of usable memory, possibly reduced by the user).
 */
static void __init propagate_e820_map_node(int nid)
{
	if (node_end_pfn[nid] > max_pfn)
		node_end_pfn[nid] = max_pfn;
	/*
	 * if a user has given mem=XXXX, then we need to make sure
	 * that the node _starts_ before that, too, not just ends
	 */
	if (node_start_pfn[nid] > max_pfn)
		node_start_pfn[nid] = max_pfn;
	/* after clamping the range may be empty, but never inverted */
	BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
}

/* 
 * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
 * method.  For node zero take this from the bottom of memory, for
 * subsequent nodes place them at node_remap_start_vaddr which contains
 * node local data in physically node local memory.  See setup_memory()
 * for details.
 */
static void __init allocate_pgdat(int nid)
{
	char buf[16];

	/* Preferred source: the node-local remap area set up by
	 * init_alloc_remap().  May legitimately fail (no remap area). */
	NODE_DATA(nid) = alloc_remap(nid, ALIGN(sizeof(pg_data_t), PAGE_SIZE));
	if (!NODE_DATA(nid)) {
		/* Fall back to carving the pgdat out of mapped low memory. */
		unsigned long pgdat_phys;

		pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT,
				 max_pfn_mapped<<PAGE_SHIFT,
				 sizeof(pg_data_t),
				 PAGE_SIZE);
		/*
		 * NOTE(review): pgdat_phys is not checked against
		 * MEMBLOCK_ERROR (unlike init_alloc_remap()); a failed
		 * search would be turned into a bogus kernel address here.
		 */
		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
		memset(buf, 0, sizeof(buf));
		sprintf(buf, "NODE_DATA %d",  nid);
		memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
	}
	printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
		nid, (unsigned long)NODE_DATA(nid));
}

M
Mel Gorman 已提交
169
/*
170
 * Remap memory allocator
M
Mel Gorman 已提交
171 172 173 174 175
 */
static unsigned long node_remap_start_pfn[MAX_NUMNODES];
static void *node_remap_end_vaddr[MAX_NUMNODES];
static void *node_remap_alloc_vaddr[MAX_NUMNODES];

176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194
/**
 * alloc_remap - Allocate remapped memory
 * @nid: NUMA node to allocate memory from
 * @size: The size of allocation
 *
 * Allocate @size bytes from the remap area of NUMA node @nid.  The
 * size of the remap area is predetermined by init_alloc_remap() and
 * only the callers considered there should call this function.  For
 * more info, please read the comment on top of init_alloc_remap().
 *
 * The caller must be ready to handle allocation failure from this
 * function and fall back to regular memory allocator in such cases.
 *
 * CONTEXT:
 * Single CPU early boot context.
 *
 * RETURNS:
 * Pointer to the allocated memory on success, %NULL on failure.
 */
195 196 197 198 199 200
void *alloc_remap(int nid, unsigned long size)
{
	void *allocation = node_remap_alloc_vaddr[nid];

	size = ALIGN(size, L1_CACHE_BYTES);

201
	if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
202
		return NULL;
203 204 205 206 207 208 209

	node_remap_alloc_vaddr[nid] += size;
	memset(allocation, 0, size);

	return allocation;
}

210 211 212 213 214 215 216 217 218 219 220
#ifdef CONFIG_HIBERNATION
/**
 * resume_map_numa_kva - add KVA mapping to the temporary page tables created
 *                       during resume from hibernation
 * @pgd_base - temporary resume page directory
 */
void resume_map_numa_kva(pgd_t *pgd_base)
{
	int node;

	for_each_online_node(node) {
		unsigned long start_va, start_pfn, nr_pages, pfn;

		start_va = (unsigned long)node_remap_start_vaddr[node];
		start_pfn = node_remap_start_pfn[node];
		/* size of this node's remap area, in pages */
		nr_pages = (node_remap_end_vaddr[node] -
			    node_remap_start_vaddr[node]) >> PAGE_SHIFT;

		printk(KERN_DEBUG "%s: node %d\n", __func__, node);

		/* one large (pmd-sized) mapping per PTRS_PER_PTE pages */
		for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
			unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
			pgd_t *pgd = pgd_base + pgd_index(vaddr);
			pud_t *pud = pud_offset(pgd, vaddr);
			pmd_t *pmd = pmd_offset(pud, vaddr);

			set_pmd(pmd, pfn_pmd(start_pfn + pfn,
						PAGE_KERNEL_LARGE_EXEC));

			printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
				__func__, vaddr, start_pfn + pfn);
		}
	}
}
#endif

246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267
/**
 * init_alloc_remap - Initialize remap allocator for a NUMA node
 * @nid: NUMA node to initizlie remap allocator for
 *
 * NUMA nodes may end up without any lowmem.  As allocating pgdat and
 * memmap on a different node with lowmem is inefficient, a special
 * remap allocator is implemented which can be used by alloc_remap().
 *
 * For each node, the amount of memory which will be necessary for
 * pgdat and memmap is calculated and two memory areas of the size are
 * allocated - one in the node and the other in lowmem; then, the area
 * in the node is remapped to the lowmem area.
 *
 * As pgdat and memmap must be allocated in lowmem anyway, this
 * doesn't waste lowmem address space; however, the actual lowmem
 * which gets remapped over is wasted.  The amount shouldn't be
 * problematic on machines this feature will be used.
 *
 * Initialization failure isn't fatal.  alloc_remap() is used
 * opportunistically and the callers will fall back to other memory
 * allocation mechanisms on failure.
 */
268
void __init init_alloc_remap(int nid, u64 start, u64 end)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long end_pfn = end >> PAGE_SHIFT;
	unsigned long size, pfn;
	u64 node_pa, remap_pa;
	void *remap_va;

	/*
	 * The acpi/srat node info can show hot-add memory zones where
	 * memory could be added but not currently present.
	 */
	printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
	       nid, start_pfn, end_pfn);

	/* calculate the necessary space aligned to large page size */
	size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
	size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
	size = ALIGN(size, LARGE_PAGE_BYTES);

	/* allocate node memory and the lowmem remap area */
	node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
	if (node_pa == MEMBLOCK_ERROR) {
		pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
			   size, nid);
		return;
	}
	memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM");

	remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
					  max_low_pfn << PAGE_SHIFT,
					  size, LARGE_PAGE_BYTES);
	if (remap_pa == MEMBLOCK_ERROR) {
		pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
			   size, nid);
		/* give the node-local area back; init failure is not fatal */
		memblock_x86_free_range(node_pa, node_pa + size);
		return;
	}
	memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG");
	remap_va = phys_to_virt(remap_pa);

	/* perform actual remap: point the lowmem virtual range at the
	 * node-local physical area, one large page at a time */
	for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
		set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
			    (node_pa >> PAGE_SHIFT) + pfn,
			    PAGE_KERNEL_LARGE);

	/* initialize remap allocator parameters */
	node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
	node_remap_start_vaddr[nid] = remap_va;
	node_remap_end_vaddr[nid] = remap_va + size;
	node_remap_alloc_vaddr[nid] = remap_va;

	printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
	       nid, node_pa, node_pa + size, remap_va, remap_va + size);
}

325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347
/*
 * Probe NUMAQ firmware tables for the NUMA layout.  Returns 1 when a
 * valid configuration was found and installed, 0 otherwise (NUMAQ
 * support compiled out, numa_off set, or parse failure).
 */
static int get_memcfg_numaq(void)
{
#ifdef CONFIG_X86_NUMAQ
	int node;

	if (numa_off)
		return 0;

	if (numaq_numa_init() < 0) {
		/* do not let partially parsed state leak out */
		remove_all_active_ranges();
		nodes_clear(numa_nodes_parsed);
		return 0;
	}

	for_each_node_mask(node, numa_nodes_parsed)
		node_set_online(node);
	sort_node_map();
	return 1;
#else
	return 0;
#endif
}

348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370
/*
 * Probe the ACPI SRAT for the NUMA layout.  Returns 1 when a valid
 * configuration was found and installed, 0 otherwise (ACPI NUMA
 * support compiled out, numa_off set, or parse failure).
 */
static int get_memcfg_from_srat(void)
{
#ifdef CONFIG_ACPI_NUMA
	int node;

	if (numa_off)
		return 0;

	if (x86_acpi_numa_init() < 0) {
		/* throw away anything the failed parse left behind */
		remove_all_active_ranges();
		nodes_clear(numa_nodes_parsed);
		return 0;
	}

	for_each_node_mask(node, numa_nodes_parsed)
		node_set_online(node);
	sort_node_map();
	return 1;
#else
	return 0;
#endif
}

371 372 373 374 375 376 377 378 379
/*
 * Pick the first NUMA memory configuration that probes successfully:
 * NUMAQ, then ACPI SRAT, finally the always-successful flat fallback.
 */
static void get_memcfg_numa(void)
{
	if (!get_memcfg_numaq() && !get_memcfg_from_srat())
		get_memcfg_numa_flat();
}

380
/*
 * Top-level NUMA boot-memory initialization for 32-bit: discover the
 * node layout, set up per-node remap areas, compute the high/low
 * memory split, allocate and zero the per-node pgdats, clamp node
 * ranges to max_pfn, and finally hand off to the bootmem allocator.
 */
void __init initmem_init(void)
{
	int nid;

	get_memcfg_numa();
	numa_init_array();

	/* set up a remap allocator for every node that has memory below
	 * max_pfn; nodes entirely above it are skipped */
	for_each_online_node(nid) {
		u64 start = (u64)node_start_pfn[nid] << PAGE_SHIFT;
		u64 end = min((u64)node_end_pfn[nid] << PAGE_SHIFT,
			      (u64)max_pfn << PAGE_SHIFT);

		if (start < end)
			init_alloc_remap(nid, start, end);
	}

#ifdef CONFIG_HIGHMEM
	highstart_pfn = highend_pfn = max_pfn;
	if (max_pfn > max_low_pfn)
		highstart_pfn = max_low_pfn;
	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
	       pages_to_mb(highend_pfn - highstart_pfn));
	num_physpages = highend_pfn;
	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
	num_physpages = max_low_pfn;
	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
			pages_to_mb(max_low_pfn));
	printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
			max_low_pfn, highstart_pfn);

	printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
			(ulong) pfn_to_kaddr(max_low_pfn));
	for_each_online_node(nid)
		allocate_pgdat(nid);

	printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
			(ulong) pfn_to_kaddr(highstart_pfn));
	for_each_online_node(nid)
		propagate_e820_map_node(nid);

	/* pgdats may come from freshly remapped memory -- zero them
	 * before anything reads through NODE_DATA() */
	for_each_online_node(nid) {
		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
		NODE_DATA(nid)->node_id = nid;
	}

	setup_bootmem_allocator();
}

431
#ifdef CONFIG_MEMORY_HOTPLUG
432
/*
 * Return the id of the node whose [node_start_pfn, node_end_pfn)
 * range contains physical address @addr, or -1 if no node covers it.
 */
static int paddr_to_nid(u64 addr)
{
	int nid;
	unsigned long pfn = PFN_DOWN(addr);

	for_each_node(nid)
		if (node_start_pfn[nid] <= pfn &&
		    pfn < node_end_pfn[nid])
			return nid;

	return -1;
}

/*
 * This function is used to ask node id BEFORE memmap and mem_section's
 * initialization (pfn_to_nid() can't be used yet).
 * If _PXM is not defined on ACPI's DSDT, node id must be found by this.
 */
int memory_add_physaddr_to_nid(u64 addr)
{
	int nid = paddr_to_nid(addr);

	/* unknown addresses are attributed to node 0 */
	if (nid < 0)
		nid = 0;
	return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif
458

459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492
/* temporary shim, will go away soon */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	unsigned long spfn = start >> PAGE_SHIFT;
	unsigned long epfn = end >> PAGE_SHIFT;

	printk(KERN_DEBUG "nid %d start_pfn %08lx end_pfn %08lx\n",
	       nid, spfn, epfn);

	/* block starts entirely above usable memory: drop it */
	if (start >= (u64)max_pfn << PAGE_SHIFT) {
		printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
		       spfn, epfn);
		return 0;
	}

	node_set_online(nid);
	memblock_x86_register_active_regions(nid, spfn, min(epfn, max_pfn));

	/* grow an existing node range, or install a fresh one */
	if (node_has_online_mem(nid)) {
		node_start_pfn[nid] = min(node_start_pfn[nid], spfn);
		node_end_pfn[nid] = max(node_end_pfn[nid], epfn);
	} else {
		node_start_pfn[nid] = spfn;
		node_end_pfn[nid] = epfn;
	}
	return 0;
}

/* temporary shim, will go away soon */
/*
 * Intentional no-op: this 32-bit implementation keeps no inter-node
 * distance table, so parsed distances are simply discarded.
 */
void __init numa_set_distance(int from, int to, int distance)
{
	/* nada */
}