init_64.c 23.8 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
24
#include <linux/pci.h>
25
#include <linux/pfn.h>
26
#include <linux/poison.h>
27
#include <linux/dma-mapping.h>
28 29
#include <linux/module.h>
#include <linux/memory_hotplug.h>
30
#include <linux/nmi.h>
L
Linus Torvalds 已提交
31 32 33 34 35 36 37 38 39 40 41 42 43 44

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
45
#include <asm/sections.h>
46
#include <asm/kdebug.h>
47
#include <asm/numa.h>
48
#include <asm/cacheflush.h>
L
Linus Torvalds 已提交
49

50 51 52 53 54 55 56 57 58 59 60 61
/*
 * PFN of last memory page.
 */
unsigned long end_pfn;

/*
 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
 * The direct mapping extends to max_pfn_mapped, so that we can directly access
 * apertures, ACPI and other tables without having to play with fixmaps.
 */
unsigned long max_pfn_mapped;

/*
 * Pages reserved below MAX_DMA_PFN by reserve_bootmem_generic();
 * handed to the zone allocator via set_dma_reserve().
 */
static unsigned long dma_reserve __initdata;

/* Per-CPU state for the generic TLB teardown (mmu_gather) code. */
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

I
Ingo Molnar 已提交
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
/*
 * Use 1GB pages for the kernel direct mapping when the CPU supports them.
 * On by default only with CONFIG_DIRECT_GBPAGES; overridable on the
 * command line with "gbpages"/"nogbpages".
 */
int direct_gbpages __meminitdata
#ifdef CONFIG_DIRECT_GBPAGES
				= 1
#endif
;

/* "nogbpages": force 2MB/4K mappings even if the CPU has GB pages. */
static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

/* "gbpages": request GB pages even when not the compile-time default. */
static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

L
Linus Torvalds 已提交
86 87 88 89 90 91 92 93
/*
 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
 * physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

void show_mem(void)
{
94 95
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
L
Linus Torvalds 已提交
96
	struct page *page;
T
Thomas Gleixner 已提交
97
	pg_data_t *pgdat;
L
Linus Torvalds 已提交
98

99
	printk(KERN_INFO "Mem-info:\n");
L
Linus Torvalds 已提交
100
	show_free_areas();
101
	for_each_online_pgdat(pgdat) {
T
Thomas Gleixner 已提交
102 103 104 105 106 107
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			/*
			 * This loop can take a while with 256 GB and
			 * 4k pages so defer the NMI watchdog:
			 */
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
108
				touch_nmi_watchdog();
T
Thomas Gleixner 已提交
109

B
Bob Picco 已提交
110 111
			if (!pfn_valid(pgdat->node_start_pfn + i))
				continue;
T
Thomas Gleixner 已提交
112

L
Linus Torvalds 已提交
113 114
			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
115 116 117 118 119 120
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
T
Thomas Gleixner 已提交
121
		}
L
Linus Torvalds 已提交
122
	}
T
Thomas Gleixner 已提交
123 124 125 126
	printk(KERN_INFO "%lu pages of RAM\n",		total);
	printk(KERN_INFO "%lu reserved pages\n",	reserved);
	printk(KERN_INFO "%lu pages shared\n",		shared);
	printk(KERN_INFO "%lu pages swap cached\n",	cached);
L
Linus Torvalds 已提交
127 128 129 130
}

int after_bootmem;

131
static __init void *spp_getpage(void)
T
Thomas Gleixner 已提交
132
{
L
Linus Torvalds 已提交
133
	void *ptr;
T
Thomas Gleixner 已提交
134

L
Linus Torvalds 已提交
135
	if (after_bootmem)
T
Thomas Gleixner 已提交
136
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
L
Linus Torvalds 已提交
137 138
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);
T
Thomas Gleixner 已提交
139 140 141 142 143

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}
L
Linus Torvalds 已提交
144

145
	pr_debug("spp_getpage %p\n", ptr);
T
Thomas Gleixner 已提交
146

L
Linus Torvalds 已提交
147
	return ptr;
T
Thomas Gleixner 已提交
148
}
L
Linus Torvalds 已提交
149

D
Daniel J Blueman 已提交
150
/*
 * Install a single kernel PTE mapping vaddr -> phys with protection
 * 'prot', allocating intermediate pmd/pte tables as needed.  The
 * pgd/pud covering the fixmap area must already exist (set up in
 * head.S), otherwise we bail out with an error.
 */
static __init void
set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		/* No pmd table yet: allocate one and hook it up. */
		pmd = (pmd_t *) spp_getpage();
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		if (pmd != pmd_offset(pud, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
				pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		/* No pte table yet: allocate one and hook it up. */
		pte = (pte_t *) spp_getpage();
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	/* Complain if we would silently change an existing, different mapping. */
	if (!pte_none(*pte) && pte_val(new_pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

200
/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	/* Clear every populated high-mapping pmd outside [_text, _end]. */
	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

L
Linus Torvalds 已提交
228
/* NOTE: this is meant to be run only at boot */
D
Daniel J Blueman 已提交
229
void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
L
Linus Torvalds 已提交
230 231 232 233
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
234
		printk(KERN_ERR "Invalid __set_fixmap\n");
L
Linus Torvalds 已提交
235 236 237 238 239
		return;
	}
	set_pte_phys(address, phys, prot);
}

240 241
/*
 * PFN range used for early direct-mapping page tables; tracked so the
 * area can be reserved later (see init_memory_mapping()).
 */
static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;

/*
 * Allocate one zeroed page for a page table and return a virtual
 * mapping of it; *phys receives its physical address.  Before bootmem
 * is up the page comes from the pre-reserved table area and is only
 * temporarily mapped via early_ioremap() (undo with unmap_low_page()).
 */
static __meminit void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}
L
Linus Torvalds 已提交
263

264
static __meminit void unmap_low_page(void *adr)
T
Thomas Gleixner 已提交
265
{
266 267 268
	if (after_bootmem)
		return;

269
	early_iounmap(adr, PAGE_SIZE);
T
Thomas Gleixner 已提交
270
}
L
Linus Torvalds 已提交
271

272
/* Must run before zap_low_mappings */
/*
 * Map [addr, addr+size) with 2MB pages into free slots of the kernel
 * high mapping (level2_kernel_pgt) and return a virtual address for it.
 * Returns NULL when no run of free slots is large enough.
 */
__meminit void *early_ioremap(unsigned long addr, unsigned long size)
{
	pmd_t *pmd, *last_pmd;
	unsigned long vaddr;
	int i, pmds;

	/* Number of 2MB pmds needed to cover the (possibly unaligned) range. */
	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	vaddr = __START_KERNEL_map;
	pmd = level2_kernel_pgt;
	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;

	/* Scan for 'pmds' consecutive unused slots. */
	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
		for (i = 0; i < pmds; i++) {
			if (pmd_present(pmd[i]))
				goto continue_outer_loop;
		}
		/* Found a run: carry the sub-2MB offset into the result. */
		vaddr += addr & ~PMD_MASK;
		addr &= PMD_MASK;

		for (i = 0; i < pmds; i++, addr += PMD_SIZE)
			set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
		__flush_tlb_all();

		return (void *)vaddr;
continue_outer_loop:
		;
	}
	printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size);

	return NULL;
}

T
Thomas Gleixner 已提交
305 306 307
/*
 * To avoid virtual aliases later:
 */
/* Tear down the temporary pmd mappings installed by early_ioremap(). */
__meminit void early_iounmap(void *addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd;
	int i, pmds;

	vaddr = (unsigned long)addr;
	/* Same pmd-count computation as in early_ioremap(). */
	pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	pmd = level2_kernel_pgt + pmd_index(vaddr);

	for (i = 0; i < pmds; i++)
		pmd_clear(pmd + i);

	__flush_tlb_all();
}

324
/*
 * Fill a pmd page with 2MB mappings covering [address, end).  Entries
 * that are already populated are left untouched; at boot, entries past
 * 'end' are explicitly cleared.  Returns the first unmapped address.
 */
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		pmd_t *pmd = pmd_page + pmd_index(address);

		if (address >= end) {
			/* Boot only: zero the remainder of the pmd page. */
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		/* Do not overwrite an existing mapping. */
		if (pmd_val(*pmd))
			continue;

		set_pte((pte_t *)pmd,
			pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
	}
	return address;
}

349
/*
 * phys_pmd_init() for an already-populated pud entry (memory hotplug
 * path): takes init_mm's page_table_lock and flushes the TLB afterward.
 */
static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long last_map_addr;

	spin_lock(&init_mm.page_table_lock);
	last_map_addr = phys_pmd_init(pmd, address, end);
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
	return last_map_addr;
}

362
/*
 * Populate a pud page with mappings for [addr, end): 1GB pages when
 * direct_gbpages is enabled, otherwise one pmd page per pud entry.
 * Note the return value is a pfn (last_map_addr >> PAGE_SHIFT), unlike
 * the pmd-level helpers which return an address.
 */
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		/* At boot, skip PUD-sized regions with no e820 memory at all. */
		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			/* Already populated; descend only if not a 1GB page. */
			if (!pud_large(*pud))
				last_map_addr = phys_pmd_update(pud, addr, end);
			continue;
		}

		if (direct_gbpages) {
			/* Map the whole pud entry with one 1GB page. */
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);

		spin_lock(&init_mm.page_table_lock);
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
		last_map_addr = phys_pmd_init(pmd, addr, end);
		spin_unlock(&init_mm.page_table_lock);

		unmap_low_page(pmd);
	}
	__flush_tlb_all();

	return last_map_addr >> PAGE_SHIFT;
}
L
Linus Torvalds 已提交
408 409 410

/*
 * Compute a worst-case size for the direct-mapping page tables up to
 * 'end' and carve an e820 area out for them (table_start/table_end).
 * With GB pages only pud pages are needed; otherwise pmd pages too.
 */
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
	if (!direct_gbpages) {
		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
		tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
	}

	/*
	 * RED-PEN putting page tables only on node 0 could
	 * cause a hotspot and fill up ZONE_DMA. The page tables
	 * need roughly 0.5KB per GB.
	 */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT,
		(table_start << PAGE_SHIFT) + tables);
}

438 439 440 441 442 443 444 445
/*
 * Reconcile the direct_gbpages request with hardware capability:
 * announce GB pages when both are set, otherwise force them off.
 */
static void __init init_gbpages(void)
{
	if (!(direct_gbpages && cpu_has_gbpages))
		direct_gbpages = 0;
	else
		printk(KERN_INFO "Using GB pages for direct mapping\n");
}

Y
Yinghai Lu 已提交
446 447 448 449
#ifdef CONFIG_MEMTEST_BOOTPARAM

/*
 * Fill [start_phys, start_phys+size) with a test pattern, read it back,
 * and reserve_early() any ranges that fail to verify ("BAD RAM").
 * Patterns: 0 -> all zeros, 1 -> all ones, 2/3 -> 0x55.../0xaa...
 */
static void __init memtest(unsigned long start_phys, unsigned long size,
				 unsigned pattern)
{
	unsigned long i;
	unsigned long *start;
	unsigned long start_bad;
	unsigned long last_bad;
	unsigned long val;
	unsigned long start_phys_aligned;
	unsigned long count;
	unsigned long incr;

	switch (pattern) {
	case 0:
		val = 0UL;
		break;
	case 1:
		val = -1UL;
		break;
	case 2:
		val = 0x5555555555555555UL;
		break;
	case 3:
		val = 0xaaaaaaaaaaaaaaaaUL;
		break;
	default:
		return;
	}

	incr = sizeof(unsigned long);
	start_phys_aligned = ALIGN(start_phys, incr);
	count = (size - (start_phys_aligned - start_phys))/incr;
	start = __va(start_phys_aligned);
	start_bad = 0;
	last_bad = 0;

	/* Write the whole range first, then verify in a second pass. */
	for (i = 0; i < count; i++)
		start[i] = val;
	for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
		if (*start != val) {
			/* Coalesce consecutive bad words into one range. */
			if (start_phys_aligned == last_bad + incr) {
				last_bad += incr;
			} else {
				if (start_bad) {
					printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
						val, start_bad, last_bad + incr);
					reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
				}
				start_bad = last_bad = start_phys_aligned;
			}
		}
	}
	/* Flush the final pending bad range, if any. */
	if (start_bad) {
		printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
			val, start_bad, last_bad + incr);
		reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
	}

}

Y
Yinghai Lu 已提交
508 509
/*
 * Number of patterns to run during the boot-time RAM test; 0 disables
 * it.  Overridable with the "memtest=" boot parameter.
 */
static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;

/* "memtest=N": run N test patterns over free memory at boot. */
static int __init parse_memtest(char *arg)
{
	if (arg)
		memtest_pattern = simple_strtoul(arg, NULL, 0);
	return 0;
}

early_param("memtest", parse_memtest);

/*
 * Run the boot-time RAM test over every free e820 chunk within
 * [start, end), once per configured pattern.
 */
static void __init early_memtest(unsigned long start, unsigned long end)
{
	u64 t_start, t_size;
	unsigned pattern;

	if (!memtest_pattern)
		return;

	printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
	for (pattern = 0; pattern < memtest_pattern; pattern++) {
		t_start = start;
		t_size = 0;
		while (t_start < end) {
			/* Locate the next free e820 area at/after t_start. */
			t_start = find_e820_area_size(t_start, &t_size, 1);

			/* done ? */
			if (t_start >= end)
				break;
			if (t_start + t_size > end)
				t_size = end - t_start;

			printk(KERN_CONT "\n  %016llx - %016llx pattern %d",
				(unsigned long long)t_start,
				(unsigned long long)t_start + t_size, pattern);

			memtest(t_start, t_size, pattern);

			t_start += t_size;
		}
	}
	printk(KERN_CONT "\n");
}
Y
Yinghai Lu 已提交
551 552 553 554 555
#else
/* CONFIG_MEMTEST_BOOTPARAM disabled: the boot-time RAM test is a no-op. */
static void __init early_memtest(unsigned long start, unsigned long end)
{
}
#endif
Y
Yinghai Lu 已提交
556

T
Thomas Gleixner 已提交
557 558 559 560 561
/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 *
 * Returns the last mapped pfn (from phys_pud_init()).
 */
unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next, last_map_addr = end;
	/* Keep the physical range for early_memtest() before __va() below. */
	unsigned long start_phys = start, end_phys = end;

	printk(KERN_INFO "init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 *
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem) {
		init_gbpages();
		find_early_table_space(end);
	}

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	/* Walk pgd entries, populating one pud page per entry. */
	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		if (after_bootmem)
			pud = pud_offset(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
		if (!after_bootmem)
			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
		unmap_low_page(pud);
	}

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();

	/* Keep the freshly built page-table pages out of bootmem's hands. */
	if (!after_bootmem)
		reserve_early(table_start << PAGE_SHIFT,
				 table_end << PAGE_SHIFT, "PGTABLE");

	if (!after_bootmem)
		early_memtest(start_phys, end_phys);

	return last_map_addr;
}

617
#ifndef CONFIG_NUMA
L
Linus Torvalds 已提交
618 619
/* Set up zone limits and the node/zone free lists (non-NUMA only). */
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = end_pfn;

	memory_present(0, 0, end_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

633 634 635
/*
 * Memory hotplug specific functions
 */
636
#ifdef CONFIG_MEMORY_HOTPLUG
637 638 639 640
/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
641
/*
 * Hot-add 'size' bytes of memory at physical address 'start' to node
 * 'nid': extend the direct mapping over the range, then hand the pages
 * to the core via __add_pages().  Returns 0 or a negative error.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size-1);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(zone, start_pfn, nr_pages);
	/* Was WARN_ON(1), which warned on every (even successful) hot-add;
	 * only warn when __add_pages() actually failed. */
	WARN_ON(ret);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);
659

660
#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
661 662 663 664
/* Without ACPI NUMA information there is only node 0 to hot-add into. */
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
666 667
#endif

668 669
#endif /* CONFIG_MEMORY_HOTPLUG */

670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689
/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 *
 * On x86, access has to be given to the first megabyte of ram because that area
 * contains bios code and data regions used by X and dosemu and similar apps.
 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
 * mmio resources as well as potential bios/acpi data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	/*
	 * The first megabyte is pages 0..255; "<= 256" was off by one and
	 * also exposed the first page above 1MB.
	 */
	if (pagenr < 256)
		return 1;
	if (!page_is_ram(pagenr))
		return 1;
	return 0;
}


T
Thomas Gleixner 已提交
690 691
/* /proc/kcore descriptors for the major kernel virtual regions. */
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
			 kcore_modules, kcore_vsyscall;

/*
 * Late memory init: release bootmem pages to the buddy allocator,
 * account reserved pages, register /proc/kcore regions and print the
 * boot memory summary.
 */
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear_bss() already clear the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	/* Spanned pages minus freed pages minus e820 holes. */
	reservedpages = end_pfn - totalram_pages -
					absent_pages_in_range(0, end_pfn);
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
				"%ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);

	cpa_init();
}

738
/*
 * Free (or, with CONFIG_DEBUG_PAGEALLOC, merely unmap) the pages in
 * [begin, end), poisoning them first to catch late init-section use.
 */
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr = begin;

	if (addr >= end)
		return;

	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
		begin, PAGE_ALIGN(end));
	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);

	for (; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		/* Poison so stale init-section pointers are caught quickly. */
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
#endif
}

/* Release the __init text/data sections back to the page allocator. */
void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

775
#ifdef CONFIG_DEBUG_RODATA
776 777
/* Known value read back by rodata_test() to verify write protection. */
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

/*
 * Write-protect the kernel read-only data, additionally make the rodata
 * section non-executable, then exercise the protection (rodata_test()).
 */
void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	set_memory_nx(start, (end - start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}
804

805 806
#endif

L
Linus Torvalds 已提交
807 808 809
#ifdef CONFIG_BLK_DEV_INITRD
/* Free the initrd image pages once they are no longer needed. */
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

814 815
/*
 * Reserve [phys, phys+len) with the bootmem allocator, handling ranges
 * that span NUMA nodes and accounting reservations inside the DMA zone.
 * Returns 0 on success or a negative error.
 */
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
				   int flags)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;
	int ret;

	if (pfn >= end_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return -EFAULT;

		/* Fix: 'len' is unsigned long, so %lu, not %u. */
		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
				phys, len);
		return -EFAULT;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	/* A range wholly inside one node can be reserved node-locally. */
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	if (nid == next_nid)
		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
	else
		ret = reserve_bootmem(phys, len, flags);

	if (ret != 0)
		return ret;

#else
	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif

	/* Track pages taken out of the DMA zone for zone sizing. */
	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}

	return 0;
}

T
Thomas Gleixner 已提交
860 861
/*
 * Walk the kernel page tables and report whether 'addr' is backed by a
 * valid pfn.  Non-canonical addresses are rejected up front.
 */
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Sign-extension bits must be all zeros or all ones (canonical). */
	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	/* 2MB page: the pmd itself holds the final pfn. */
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

T
Thomas Gleixner 已提交
893 894 895 896 897
/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

/* Return the vsyscall gate VMA, or NULL for 32bit-compat tasks. */
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);
T
Thomas Gleixner 已提交
917

918 919
	if (!vma)
		return 0;
T
Thomas Gleixner 已提交
920

L
Linus Torvalds 已提交
921 922 923
	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

T
Thomas Gleixner 已提交
924 925 926 927
/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}
933

934 935 936 937 938 939 940 941
const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}
942 943 944 945 946

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
947 948 949 950
/* State for coalescing vmemmap_populate()'s "[addr-addr] PMD" output. */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

T
Thomas Gleixner 已提交
951 952
/*
 * Back the struct page range [start_page, start_page+size) with 2MB
 * pages allocated node-locally; pmds that already exist are only
 * verified.  Returns 0 or -ENOMEM.
 */
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			pte_t entry;
			void *p;

			p = vmemmap_alloc_block(PMD_SIZE, node);
			if (!p)
				return -ENOMEM;

			entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
							PAGE_KERNEL_LARGE);
			set_pmd(pmd, __pmd(pte_val(entry)));

			/* check to see if we have contiguous blocks */
			if (p_end != p || node_start != node) {
				if (p_start)
					printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						addr_start, addr_end-1, p_start, p_end-1, node_start);
				addr_start = addr;
				node_start = node;
				p_start = p;
			}
			addr_end = addr + PMD_SIZE;
			p_end = p + PMD_SIZE;
		} else {
			vmemmap_verify((pte_t *)pmd, node, addr, next);
		}
	}
	return 0;
}
1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012

/* Flush the pending coalesced PMD-mapping report and reset the state. */
void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
1013
#endif