numa_32.c
/*
 * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
 * August 2002: added remote node KVA remap - Martin J. Bligh 
 *
 * Copyright (C) 2002, IBM Corp.
 *
 * All rights reserved.          
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/highmem.h>
#include <linux/initrd.h>
#include <linux/nodemask.h>
#include <linux/module.h>
#include <linux/kexec.h>
#include <linux/pfn.h>
#include <linux/swap.h>
#include <linux/acpi.h>

#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/mmzone.h>
#include <asm/bios_ebda.h>
#include <asm/proto.h>

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

/*
 * numa interface - we expect the numa architecture specific code to have
 *                  populated the following initialisation.
 *
 * 1) node_online_map  - the map of all nodes configured (online) in the system
 * 2) node_start_pfn   - the starting page frame number for a node
 * 3) node_end_pfn     - the ending page frame number for a node
 */
unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;


#ifdef CONFIG_DISCONTIGMEM
/*
 * 4) physnode_map     - the mapping between a pfn and owning node
 *
 * physnode_map keeps track of the physical memory layout of a generic
 * numa node at a 64MB granularity: each element of the array represents
 * 64MB of memory and is marked with the id of the node that owns it.
 * So, if the first gig is on node 0, and the second gig is on node 1,
 * physnode_map will contain:
 *
 *     physnode_map[0-15] = 0;
 *     physnode_map[16-31] = 1;
 *     physnode_map[32- ] = -1;
 */
s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
EXPORT_SYMBOL(physnode_map);

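/*
 * Stamp the owning node id into every physnode_map element covered by
 * the pfn range [start, end).
 */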
void memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

	printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
			nid, start, end);
	printk(KERN_DEBUG "  Setting physnode_map array to node %d for pfns:\n", nid);
	printk(KERN_DEBUG "  ");
	for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
		physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
		printk(KERN_CONT "%lx ", pfn);
	}
	printk(KERN_CONT "\n");
}

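/*
 * Bytes needed for the node's memmap (its struct page array), padded
 * with one extra entry of slack.
 */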
unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
					      unsigned long end_pfn)
{
	unsigned long nr_pages = end_pfn - start_pfn;

	if (!nr_pages)
		return 0;

	return (nr_pages + 1) * sizeof(struct page);
}
#endif

extern unsigned long find_max_low_pfn(void);
extern unsigned long highend_pfn, highstart_pfn;

#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)

unsigned long node_remap_size[MAX_NUMNODES];
static void *node_remap_start_vaddr[MAX_NUMNODES];
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);

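/*
 * Base pfn and page count of the low-memory range whose kernel virtual
 * addresses are repurposed for the per-node KVA remap.
 */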
static unsigned long kva_start_pfn;
static unsigned long kva_pages;

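/* Resolve a CPU's home node via the 32-bit APIC driver callback. */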
int __cpuinit numa_cpu_node(int cpu)
{
	return apic->x86_32_numa_cpu_node(cpu);
}

/*
 * FLAT - support for basic PC memory model with discontig enabled, essentially
 *        a single node with all available processors in it with a flat
 *        memory map.
 */
int __init get_memcfg_numa_flat(void)
{
	printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");

	node_start_pfn[0] = 0;
	node_end_pfn[0] = max_pfn;
	memblock_x86_register_active_regions(0, 0, max_pfn);
	memory_present(0, 0, max_pfn);
	node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);

	/* Indicate there is one node available. */
	nodes_clear(node_online_map);
	node_set_online(0);
	return 1;
}

/*
 * Clamp the node's pfn range so it does not extend beyond max_pfn
 */
static void __init propagate_e820_map_node(int nid)
{
	if (node_end_pfn[nid] > max_pfn)
		node_end_pfn[nid] = max_pfn;
	/*
	 * If a user has given mem=XXXX, then we need to make sure
	 * that the node _starts_ before that, too, not just ends.
	 */
	if (node_start_pfn[nid] > max_pfn)
		node_start_pfn[nid] = max_pfn;
	BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
}

/*
 * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
 * method.  For node zero take this from the bottom of memory, for
 * subsequent nodes place them at node_remap_start_vaddr which contains
 * node local data in physically node local memory.  See initmem_init()
 * for details.
 */
static void __init allocate_pgdat(int nid)
{
	char buf[16];

	if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
	else {
		unsigned long pgdat_phys;
		pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT,
				 max_pfn_mapped<<PAGE_SHIFT,
				 sizeof(pg_data_t),
				 PAGE_SIZE);
		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
		memset(buf, 0, sizeof(buf));
		sprintf(buf, "NODE_DATA %d",  nid);
		memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
	}
	printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
		nid, (unsigned long)NODE_DATA(nid));
}

/*
 * In the DISCONTIGMEM and SPARSEMEM memory models, a portion of the kernel
 * virtual address space (KVA) is reserved and portions of nodes are mapped
 * using it. This is to allow node-local memory to be allocated for
 * structures that would normally require ZONE_NORMAL. The memory is
 * allocated with alloc_remap() and callers should be prepared to fall
 * back to the bootmem allocator when alloc_remap() returns NULL.
 */
static unsigned long node_remap_start_pfn[MAX_NUMNODES];
static void *node_remap_end_vaddr[MAX_NUMNODES];
static void *node_remap_alloc_vaddr[MAX_NUMNODES];
static unsigned long node_remap_offset[MAX_NUMNODES];

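/*
 * Hand out an L1-cache-aligned chunk of @nid's remap area, or NULL
 * once the area is exhausted (callers then fall back to bootmem).
 */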
void *alloc_remap(int nid, unsigned long size)
{
	void *allocation = node_remap_alloc_vaddr[nid];

	size = ALIGN(size, L1_CACHE_BYTES);

	if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
		return NULL;

	node_remap_alloc_vaddr[nid] += size;
	memset(allocation, 0, size);

	return allocation;
}

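/*
 * Point each node's reserved kernel virtual addresses at its node-local
 * RAM, one large (PMD-sized) page at a time.
 */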
static void __init remap_numa_kva(void)
{
	void *vaddr;
	unsigned long pfn;
	int node;

	for_each_online_node(node) {
		printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
		for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
			vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
			printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
				(unsigned long)vaddr,
				node_remap_start_pfn[node] + pfn);
			set_pmd_pfn((ulong) vaddr,
				node_remap_start_pfn[node] + pfn,
				PAGE_KERNEL_LARGE);
		}
	}
}

#ifdef CONFIG_HIBERNATION
/**
 * resume_map_numa_kva - add KVA mapping to the temporary page tables created
 *                       during resume from hibernation
 * @pgd_base: temporary resume page directory
 */
void resume_map_numa_kva(pgd_t *pgd_base)
{
	int node;

	for_each_online_node(node) {
		unsigned long start_va, start_pfn, size, pfn;

		start_va = (unsigned long)node_remap_start_vaddr[node];
		start_pfn = node_remap_start_pfn[node];
		size = node_remap_size[node];

		printk(KERN_DEBUG "%s: node %d\n", __func__, node);

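		/* Recreate each large-page mapping in the temporary tables. */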
		for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) {
			unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
			pgd_t *pgd = pgd_base + pgd_index(vaddr);
			pud_t *pud = pud_offset(pgd, vaddr);
			pmd_t *pmd = pmd_offset(pud, vaddr);

			set_pmd(pmd, pfn_pmd(start_pfn + pfn,
						PAGE_KERNEL_LARGE_EXEC));

			printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
				__func__, vaddr, start_pfn + pfn);
		}
	}
}
#endif

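/*
 * Reserve node-local RAM to back @nid's remap area, and record its size
 * and its offset into the shared KVA window.  Returns the area size in
 * pages, or 0 if the node has no usable memory.
 */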
static __init unsigned long init_alloc_remap(int nid, unsigned long offset)
{
	unsigned long size;
	u64 node_kva;

	/*
	 * The acpi/srat node info can show hot-add memory zones where
	 * memory could be added but not currently present.
	 */
	printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
	       nid, node_start_pfn[nid], node_end_pfn[nid]);
	if (node_start_pfn[nid] > max_pfn)
		return 0;
	if (!node_end_pfn[nid])
		return 0;
	if (node_end_pfn[nid] > max_pfn)
		node_end_pfn[nid] = max_pfn;

	/* ensure the remap includes space for the pgdat. */
	size = node_remap_size[nid];
	size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);

	/* convert size to large (pmd size) pages, rounding up */
	size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
	/* now the roundup is correct, convert to PAGE_SIZE pages */
	size = size * PTRS_PER_PTE;

	node_kva = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT,
					  (u64)node_end_pfn[nid] << PAGE_SHIFT,
					  (u64)size << PAGE_SHIFT,
					  LARGE_PAGE_BYTES);
	if (node_kva == MEMBLOCK_ERROR)
		panic("Can not get kva ram\n");

	node_remap_size[nid] = size;
	node_remap_offset[nid] = offset;
	printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of node %d at %llx\n",
	       size, nid, node_kva >> PAGE_SHIFT);

	/*
	 * Reserve this range now so that the KVA RAM is not handed out
	 * as free memory later; the layout will be: KVA addresses first,
	 * then the KVA RAM backing them.
	 *
	 * We are supposed to record only ranges below max_low_pfn, but
	 * there could be holes in high memory, and the only check for a
	 * free page is page_is_ram(pfn) && !page_is_reserved_early(pfn).
	 * So call memblock_x86_reserve_range() here and hope we don't
	 * run out of that array.
	 */
	memblock_x86_reserve_range(node_kva,
				   node_kva + ((u64)size << PAGE_SHIFT),
				   "KVA RAM");

	node_remap_start_pfn[nid] = node_kva >> PAGE_SHIFT;

	return size;
}

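/*
 * Seed the remap allocator for @nid: the node's slice of the KVA
 * window, with the first page-aligned chunk set aside for the pgdat.
 */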
static void init_remap_allocator(int nid)
{
	node_remap_start_vaddr[nid] = pfn_to_kaddr(
			kva_start_pfn + node_remap_offset[nid]);
	node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
		(node_remap_size[nid] * PAGE_SIZE);
	node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
		ALIGN(sizeof(pg_data_t), PAGE_SIZE);

	printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
		(ulong) node_remap_start_vaddr[nid],
		(ulong) node_remap_end_vaddr[nid]);
}

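/*
 * Top-level NUMA setup for 32-bit: discover the memory configuration,
 * reserve and map the per-node KVA remap areas, allocate the per-node
 * pgdats, then hand over to the bootmem allocator.
 */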
void __init initmem_init(void)
{
	unsigned long reserve_pages = 0;
	int nid;

	/*
	 * When mapping a NUMA machine we allocate the node_mem_map arrays
	 * from node local memory.  They are then mapped directly into KVA
	 * between zone normal and vmalloc space.  Calculate the size of
	 * this space and use it to adjust the boundary between ZONE_NORMAL
	 * and ZONE_HIGHMEM.
	 */

	get_memcfg_numa();
	numa_init_array();

	for_each_online_node(nid)
		reserve_pages += init_alloc_remap(nid, reserve_pages);
	kva_pages = roundup(reserve_pages, PTRS_PER_PTE);
	printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
			reserve_pages);
	kva_start_pfn = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
				max_low_pfn << PAGE_SHIFT,
				kva_pages << PAGE_SHIFT,
				PTRS_PER_PTE << PAGE_SHIFT) >> PAGE_SHIFT;
	if (kva_start_pfn == MEMBLOCK_ERROR)
		panic("Can not get kva space\n");

	printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
		kva_start_pfn, max_low_pfn);
	printk(KERN_INFO "max_pfn = %lx\n", max_pfn);

	/* avoid clash with initrd */
	memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT,
		      (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
		     "KVA PG");
#ifdef CONFIG_HIGHMEM
	highstart_pfn = highend_pfn = max_pfn;
	if (max_pfn > max_low_pfn)
		highstart_pfn = max_low_pfn;
	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
	       pages_to_mb(highend_pfn - highstart_pfn));
	num_physpages = highend_pfn;
	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
	num_physpages = max_low_pfn;
	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
			pages_to_mb(max_low_pfn));
	printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
			max_low_pfn, highstart_pfn);
	printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
			(ulong) pfn_to_kaddr(max_low_pfn));
	for_each_online_node(nid) {
		init_remap_allocator(nid);

		allocate_pgdat(nid);
	}
	remap_numa_kva();

	printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
			(ulong) pfn_to_kaddr(highstart_pfn));
	for_each_online_node(nid)
		propagate_e820_map_node(nid);
	for_each_online_node(nid) {
		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
		NODE_DATA(nid)->node_id = nid;
	}
	setup_bootmem_allocator();
}

#ifdef CONFIG_MEMORY_HOTPLUG
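/* Linear scan of the recorded node pfn ranges; -1 if no node owns @addr. */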
static int paddr_to_nid(u64 addr)
{
	int nid;
	unsigned long pfn = PFN_DOWN(addr);

	for_each_node(nid)
		if (node_start_pfn[nid] <= pfn &&
		    pfn < node_end_pfn[nid])
			return nid;

	return -1;
}

/*
 * This function is used to ask for a node id BEFORE memmap and
 * mem_section's initialization (pfn_to_nid() can't be used yet).
 * If _PXM is not defined in the ACPI DSDT, the node id must be found
 * this way.
 */
int memory_add_physaddr_to_nid(u64 addr)
{
	int nid = paddr_to_nid(addr);
	return (nid >= 0) ? nid : 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif