init_64.c 22.2 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
24
#include <linux/pci.h>
25
#include <linux/pfn.h>
26
#include <linux/poison.h>
27
#include <linux/dma-mapping.h>
28 29
#include <linux/module.h>
#include <linux/memory_hotplug.h>
30
#include <linux/nmi.h>
L
Linus Torvalds 已提交
31 32 33 34 35 36 37 38 39 40 41 42 43 44

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
45
#include <asm/sections.h>
46
#include <asm/kdebug.h>
47
#include <asm/numa.h>
48
#include <asm/cacheflush.h>
L
Linus Torvalds 已提交
49

50 51
static unsigned long dma_reserve __initdata;

L
Linus Torvalds 已提交
52 53
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

I
Ingo Molnar 已提交
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
int direct_gbpages __meminitdata
#ifdef CONFIG_DIRECT_GBPAGES
				= 1
#endif
;

/* "nogbpages" boot option: never use 1GB pages for the direct mapping. */
static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;	/* 0 = option handled */
}
early_param("nogbpages", parse_direct_gbpages_off);

/* "gbpages" boot option: request 1GB pages for the direct mapping. */
static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;	/* 0 = option handled */
}
early_param("gbpages", parse_direct_gbpages_on);

L
Linus Torvalds 已提交
74 75 76 77 78 79 80 81
/*
 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
 * physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

void show_mem(void)
{
82 83
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
L
Linus Torvalds 已提交
84
	struct page *page;
T
Thomas Gleixner 已提交
85
	pg_data_t *pgdat;
L
Linus Torvalds 已提交
86

87
	printk(KERN_INFO "Mem-info:\n");
L
Linus Torvalds 已提交
88
	show_free_areas();
89
	for_each_online_pgdat(pgdat) {
T
Thomas Gleixner 已提交
90 91 92 93 94 95
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			/*
			 * This loop can take a while with 256 GB and
			 * 4k pages so defer the NMI watchdog:
			 */
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
96
				touch_nmi_watchdog();
T
Thomas Gleixner 已提交
97

B
Bob Picco 已提交
98 99
			if (!pfn_valid(pgdat->node_start_pfn + i))
				continue;
T
Thomas Gleixner 已提交
100

L
Linus Torvalds 已提交
101 102
			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
103 104 105 106 107 108
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
T
Thomas Gleixner 已提交
109
		}
L
Linus Torvalds 已提交
110
	}
T
Thomas Gleixner 已提交
111 112 113 114
	printk(KERN_INFO "%lu pages of RAM\n",		total);
	printk(KERN_INFO "%lu reserved pages\n",	reserved);
	printk(KERN_INFO "%lu pages shared\n",		shared);
	printk(KERN_INFO "%lu pages swap cached\n",	cached);
L
Linus Torvalds 已提交
115 116 117 118
}

int after_bootmem;

119
/*
 * Allocate one zeroed, page-aligned page for early page-table use.
 * Uses the buddy allocator once it is up (after_bootmem), bootmem before.
 */
static __init void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	/* NOTE(review): message names the caller (set_pte_phys), not this
	 * function - slightly misleading but preserved as-is. */
	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}
L
Linus Torvalds 已提交
137

T
Thomas Gleixner 已提交
138 139
/*
 * Install a single kernel PTE mapping vaddr -> phys with protection
 * 'prot', allocating intermediate PMD/PTE tables as needed. The PGD
 * entry must already exist (set up by head.S for the fixmap region).
 */
static __init void
set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		/* Allocate a PMD table and hook it into the PUD. */
		pmd = (pmd_t *) spp_getpage();
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		/* Sanity check: walking back down must reach our new table. */
		if (pmd != pmd_offset(pud, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
				pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		/* Allocate a PTE table and hook it into the PMD. */
		pte = (pte_t *) spp_getpage();
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	/* Complain if we silently change an existing, different mapping. */
	if (!pte_none(*pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

188
/*
189 190 191
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
/*
 * Clear kernel-high-mapping PMDs that fall outside [_text, _end):
 * head.S maps more than the kernel image actually occupies, and the
 * leftover entries (including invalid pmds before _text) must go.
 */
void __init cleanup_highmap(void)
{
	unsigned long kernel_end = round_up((unsigned long)_end, PMD_SIZE) - 1;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd_t *entry = level2_kernel_pgt + i;
		unsigned long va = __START_KERNEL_map +
				   (unsigned long)i * PMD_SIZE;

		if (!pmd_present(*entry))
			continue;
		if (va < (unsigned long)_text || va > kernel_end)
			set_pmd(entry, __pmd(0));
	}
}

L
Linus Torvalds 已提交
216
/* NOTE: this is meant to be run only at boot */
T
Thomas Gleixner 已提交
217 218
/*
 * Map one fixmap slot to a physical address with the given protection.
 * NOTE: this is meant to be run only at boot.
 */
void __init
__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
	/* Reject out-of-range fixmap indices. */
	if (idx >= __end_of_fixed_addresses) {
		printk(KERN_ERR "Invalid __set_fixmap\n");
		return;
	}

	set_pte_phys(__fix_to_virt(idx), phys, prot);
}

229 230
static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;
L
Linus Torvalds 已提交
231

232
/*
 * Allocate one zeroed page for early page-table construction and
 * return its (temporarily mapped) virtual address; the physical
 * address is returned through *phys. Before bootmem is up, pages are
 * carved consecutively out of the area found by
 * find_early_table_space() and mapped via early_ioremap().
 */
static __meminit void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}
L
Linus Torvalds 已提交
252

253
static __meminit void unmap_low_page(void *adr)
T
Thomas Gleixner 已提交
254
{
255 256 257
	if (after_bootmem)
		return;

258
	early_iounmap(adr, PAGE_SIZE);
T
Thomas Gleixner 已提交
259
}
L
Linus Torvalds 已提交
260

261
/* Must run before zap_low_mappings */
262
/* Must run before zap_low_mappings */
/*
 * Map [addr, addr+size) with 2MB kernel pages by borrowing a run of
 * free slots in level2_kernel_pgt, and return the resulting virtual
 * address inside the kernel high mapping. Returns NULL when no run of
 * 'pmds' consecutive free slots exists.
 */
__meminit void *early_ioremap(unsigned long addr, unsigned long size)
{
	pmd_t *pmd, *last_pmd;
	unsigned long vaddr;
	int i, pmds;

	/* Number of 2MB slots needed, accounting for addr's offset. */
	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	vaddr = __START_KERNEL_map;
	pmd = level2_kernel_pgt;
	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;

	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
		/* NOTE(review): when pmds > 1 this scan can index past
		 * last_pmd near the end of the table - latent overrun,
		 * preserved as-is. */
		for (i = 0; i < pmds; i++) {
			if (pmd_present(pmd[i]))
				goto continue_outer_loop;
		}
		/* Found a free run: map it and return addr's offset within. */
		vaddr += addr & ~PMD_MASK;
		addr &= PMD_MASK;

		for (i = 0; i < pmds; i++, addr += PMD_SIZE)
			set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
		__flush_tlb_all();

		return (void *)vaddr;
continue_outer_loop:
		;
	}
	printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size);

	return NULL;
}

T
Thomas Gleixner 已提交
294 295 296
/*
 * To avoid virtual aliases later:
 */
297
__meminit void early_iounmap(void *addr, unsigned long size)
298
{
299 300 301 302 303 304 305
	unsigned long vaddr;
	pmd_t *pmd;
	int i, pmds;

	vaddr = (unsigned long)addr;
	pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	pmd = level2_kernel_pgt + pmd_index(vaddr);
T
Thomas Gleixner 已提交
306

307 308
	for (i = 0; i < pmds; i++)
		pmd_clear(pmd + i);
T
Thomas Gleixner 已提交
309

A
Andi Kleen 已提交
310
	__flush_tlb_all();
311 312
}

313
/*
 * Fill a PMD page with 2MB direct mappings for [address, end), starting
 * at address's slot. Existing entries are left alone; trailing slots
 * beyond 'end' are zeroed during boot. Returns the first unmapped
 * address.
 */
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		pmd_t *pmd = pmd_page + pmd_index(address);

		if (address >= end) {
			/* Boot-time tables may contain garbage: clear the
			 * rest. At runtime the table is already clean. */
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		/* Don't clobber an existing mapping. */
		if (pmd_val(*pmd))
			continue;

		/* 2MB large-page mapping. */
		set_pte((pte_t *)pmd,
			pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
	}
	return address;
}

338
static unsigned long __meminit
339 340
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
T
Thomas Gleixner 已提交
341
	pmd_t *pmd = pmd_offset(pud, 0);
342 343
	unsigned long last_map_addr;

344
	spin_lock(&init_mm.page_table_lock);
345
	last_map_addr = phys_pmd_init(pmd, address, end);
346 347
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
348
	return last_map_addr;
349 350
}

351
/*
 * Populate a PUD page with direct mappings for [addr, end), using 1GB
 * pages when direct_gbpages is enabled, otherwise allocating PMD
 * tables. Returns the last mapped address as a PFN (shifted by
 * PAGE_SHIFT).
 */
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		/* Skip (and clear) regions e820 says contain no RAM. */
		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		/* Entry already populated: only extend non-large entries. */
		if (pud_val(*pud)) {
			if (!pud_large(*pud))
				last_map_addr = phys_pmd_update(pud, addr, end);
			continue;
		}

		if (direct_gbpages) {
			/* Map the whole gigabyte with one entry. */
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);

		spin_lock(&init_mm.page_table_lock);
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
		last_map_addr = phys_pmd_init(pmd, addr, end);
		spin_unlock(&init_mm.page_table_lock);

		unmap_low_page(pmd);
	}
	__flush_tlb_all();

	return last_map_addr >> PAGE_SHIFT;
}
L
Linus Torvalds 已提交
397 398 399

/*
 * Reserve a physical area (via e820) big enough to hold the PUD/PMD
 * tables needed to direct-map memory up to 'end'. Sets table_start /
 * table_end (in PFNs); alloc_low_page() consumes pages from this area.
 */
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
	/* With GB pages there are no PMD tables to account for. */
	if (!direct_gbpages) {
		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
		tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
	}

	/*
	 * RED-PEN putting page tables only on node 0 could
	 * cause a hotspot and fill up ZONE_DMA. The page tables
	 * need roughly 0.5KB per GB.
	 */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT,
		(table_start << PAGE_SHIFT) + tables);
}

427 428 429 430 431 432 433 434
/*
 * Honor the gbpages request only when the CPU actually supports
 * 1GB pages; otherwise fall back to 2MB mappings.
 */
static void __init init_gbpages(void)
{
	if (!direct_gbpages || !cpu_has_gbpages)
		direct_gbpages = 0;
	else
		printk(KERN_INFO "Using GB pages for direct mapping\n");
}

Y
Yinghai Lu 已提交
435 436 437 438
#ifdef CONFIG_MEMTEST_BOOTPARAM

/*
 * Write 'pattern' across [start_phys, start_phys+size), read it back,
 * and reserve any ranges that fail as "BAD RAM" so the kernel never
 * uses them. Runs of consecutive bad words are coalesced.
 */
static void __init memtest(unsigned long start_phys, unsigned long size,
				 unsigned pattern)
{
	unsigned long i;
	unsigned long *start;
	unsigned long start_bad;
	unsigned long last_bad;
	unsigned long val;
	unsigned long start_phys_aligned;
	unsigned long count;
	unsigned long incr;

	switch (pattern) {
	case 0:
		val = 0UL;
		break;
	case 1:
		val = -1UL;
		break;
	case 2:
		val = 0x5555555555555555UL;
		break;
	case 3:
		val = 0xaaaaaaaaaaaaaaaaUL;
		break;
	default:
		return;
	}

	incr = sizeof(unsigned long);
	start_phys_aligned = ALIGN(start_phys, incr);
	count = (size - (start_phys_aligned - start_phys))/incr;
	start = __va(start_phys_aligned);
	start_bad = 0;
	last_bad = 0;

	for (i = 0; i < count; i++)
		start[i] = val;
	for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
		if (*start != val) {
			if (start_phys_aligned == last_bad + incr) {
				/* Extend the current run of bad words. */
				last_bad += incr;
			} else {
				if (start_bad) {
					printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
						val, start_bad, last_bad + incr);
					/*
					 * Fix: reserve_early() takes
					 * (start, end, name); passing the
					 * SIZE under-reserved the range.
					 */
					reserve_early(start_bad, last_bad + incr, "BAD RAM");
				}
				start_bad = last_bad = start_phys_aligned;
			}
		}
	}
	/* Flush the final pending run, if any. */
	if (start_bad) {
		printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
			val, start_bad, last_bad + incr);
		reserve_early(start_bad, last_bad + incr, "BAD RAM");
	}

}

Y
Yinghai Lu 已提交
497 498
static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;

Y
Yinghai Lu 已提交
499 500 501
/* "memtest=N" boot option: run N memtest patterns (0 disables). */
static int __init parse_memtest(char *arg)
{
	if (arg)
		memtest_pattern = simple_strtoul(arg, NULL, 0);
	return 0;	/* 0 = option handled */
}

early_param("memtest", parse_memtest);

/*
 * Run memtest() with each configured pattern over every free e820 RAM
 * chunk inside [start, end). No-op when memtest_pattern is 0.
 */
static void __init early_memtest(unsigned long start, unsigned long end)
{
	unsigned long t_start, t_size;
	unsigned pattern;

	if (!memtest_pattern)
		return;

	printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
	for (pattern = 0; pattern < memtest_pattern; pattern++) {
		t_start = start;
		t_size = 0;
		/* Walk the free e820 areas within [start, end). */
		while (t_start < end) {
			t_start = find_e820_area_size(t_start, &t_size, 1);

			/* done ? */
			if (t_start >= end)
				break;
			if (t_start + t_size > end)
				t_size = end - t_start;

			printk(KERN_CONT "\n  %016lx - %016lx pattern %d",
				t_start, t_start + t_size, pattern);

			memtest(t_start, t_size, pattern);

			t_start += t_size;
		}
	}
	printk(KERN_CONT "\n");
}
Y
Yinghai Lu 已提交
539 540 541 542 543
#else
static void __init early_memtest(unsigned long start, unsigned long end)
{
}
#endif
Y
Yinghai Lu 已提交
544

T
Thomas Gleixner 已提交
545 546 547 548 549
/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
550
/*
 * Set up the direct mapping of physical [start, end) at PAGE_OFFSET.
 * Runs before bootmem is initialized (and again for memory hotplug);
 * pages are taken straight from physical memory and temporarily mapped
 * for access. Returns the last mapped address (a PFN, as produced by
 * phys_pud_init()).
 */
unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next, last_map_addr = end;
	unsigned long start_phys = start, end_phys = end;

	printk(KERN_INFO "init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 *
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem) {
		init_gbpages();
		find_early_table_space(end);
	}

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	/* Populate one PGD entry (512GB) per iteration. */
	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		if (after_bootmem)
			pud = pud_offset(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
		if (!after_bootmem)
			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
		unmap_low_page(pud);
	}

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();

	/* Protect the pages we just used for page tables. */
	if (!after_bootmem)
		reserve_early(table_start << PAGE_SHIFT,
				 table_end << PAGE_SHIFT, "PGTABLE");

	if (!after_bootmem)
		early_memtest(start_phys, end_phys);

	return last_map_addr;
}

605
#ifndef CONFIG_NUMA
L
Linus Torvalds 已提交
606 607
/*
 * Non-NUMA zone setup: register the single node's memory with
 * sparsemem and initialize the free lists per zone.
 */
void __init paging_init(void)
{
	/* Unlisted zones stay zero, exactly as the memset() did before. */
	unsigned long max_zone_pfns[MAX_NR_ZONES] = {
		[ZONE_DMA]	= MAX_DMA_PFN,
		[ZONE_DMA32]	= MAX_DMA32_PFN,
		[ZONE_NORMAL]	= end_pfn,
	};

	memory_present(0, 0, end_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

621 622 623 624 625 626
/*
 * Memory hotplug specific functions
 */
/* Hand one hot-added page over to the buddy allocator. */
void online_page(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);

	/* Account for the newly usable page. */
	num_physpages++;
	totalram_pages++;
}

633
#ifdef CONFIG_MEMORY_HOTPLUG
634 635 636 637
/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
638
/*
 * Hot-add a physical memory range: extend the direct mapping, then
 * register the pages with the memory-hotplug core. Memory is added
 * always to the NORMAL zone (see comment above). Returns 0 on success
 * or a negative errno from __add_pages().
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size-1);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(zone, start_pfn, nr_pages);
	/* Fix: warn only on failure - WARN_ON(1) fired on every hot-add. */
	WARN_ON(ret);

	return ret;
}
655
EXPORT_SYMBOL_GPL(arch_add_memory);
656

657
#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
658 659 660 661
/* Without ACPI NUMA info, all hot-added memory belongs to node 0. */
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
662
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
663 664
#endif

665 666
#endif /* CONFIG_MEMORY_HOTPLUG */

T
Thomas Gleixner 已提交
667 668
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
			 kcore_modules, kcore_vsyscall;
L
Linus Torvalds 已提交
669 670 671

/*
 * Final memory initialization: release bootmem to the buddy allocator,
 * compute section sizes, register /proc/kcore areas and print the
 * memory summary banner.
 */
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear_bss() already clear the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	/* Pages neither freed nor in e820 holes count as reserved. */
	reservedpages = end_pfn - totalram_pages -
					absent_pages_in_range(0, end_pfn);
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
				"%ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);

	cpa_init();
}

715
/*
 * Free the kernel-virtual range [begin, end) back to the page
 * allocator, poisoning the contents first. With DEBUG_PAGEALLOC the
 * pages are unmapped instead of freed so stray accesses fault.
 */
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr = begin;

	if (addr >= end)
		return;

	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
		begin, PAGE_ALIGN(end));
	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);

	for (; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		/* Poison so use-after-free of init data is detectable. */
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
#endif
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
748 749
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
L
Linus Torvalds 已提交
750 751
}

752
#ifdef CONFIG_DEBUG_RODATA
753 754
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
755 756 757

/*
 * Write-protect kernel text and rodata, and additionally mark rodata
 * non-executable. Optionally exercises the CPA machinery when
 * CONFIG_CPA_DEBUG is set.
 */
void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	set_memory_nx(start, (end - start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}
781

782 783
#endif

L
Linus Torvalds 已提交
784 785 786
#ifdef CONFIG_BLK_DEV_INITRD
/* Free the initrd's pages once it is no longer needed. */
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

T
Thomas Gleixner 已提交
791 792
/*
 * Reserve [phys, phys+len) with bootmem (on the owning node under
 * NUMA), tolerating out-of-range requests from kdump kernels. DMA-zone
 * reservations are also accounted via set_dma_reserve().
 */
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
	int nid = phys_to_nid(phys);
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= end_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
				phys, len);
		return;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
#else
	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif
	/* Track how much of the DMA zone is eaten by reservations. */
	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}
}

T
Thomas Gleixner 已提交
823 824
/*
 * Return non-zero when 'addr' is a canonical kernel virtual address
 * backed by a valid page, by walking the kernel page tables. Handles
 * 2MB large-page PMDs.
 */
int kern_addr_valid(unsigned long addr)
{
	/* Canonical-address check: high bits must be all 0 or all 1. */
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	/* 2MB mapping: no PTE level to descend into. */
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

T
Thomas Gleixner 已提交
856 857 858 859 860
/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
L
Linus Torvalds 已提交
861
static struct vm_area_struct gate_vma = {
T
Thomas Gleixner 已提交
862 863 864 865
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
L
Linus Torvalds 已提交
866 867 868 869 870
};

/*
 * Return the pseudo-VMA covering the 64-bit vsyscall page for 'tsk',
 * or NULL for 32-bit compat tasks (which have a real vsyscall VMA).
 */
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);
T
Thomas Gleixner 已提交
880

881 882
	if (!vma)
		return 0;
T
Thomas Gleixner 已提交
883

L
Linus Torvalds 已提交
884 885 886
	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

T
Thomas Gleixner 已提交
887 888 889 890
/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
L
Linus Torvalds 已提交
891 892 893
 */
int in_gate_area_no_task(unsigned long addr)
{
894
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
L
Linus Torvalds 已提交
895
}
896

897 898 899 900 901 902 903 904
/* Name special VMAs for /proc/<pid>/maps: "[vdso]" and "[vsyscall]". */
const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma == &gate_vma)
		return "[vsyscall]";
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	return NULL;
}
905 906 907 908 909

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
T
Thomas Gleixner 已提交
910 911
/*
 * Populate the sparsemem vmemmap for [start_page, start_page+size)
 * using 2MB PMD mappings, allocating node-local blocks. Returns 0 on
 * success or -ENOMEM.
 */
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			pte_t entry;
			void *p;

			/* One node-local 2MB block backs this PMD. */
			p = vmemmap_alloc_block(PMD_SIZE, node);
			if (!p)
				return -ENOMEM;

			entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
							PAGE_KERNEL_LARGE);
			set_pmd(pmd, __pmd(pte_val(entry)));

			printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
				addr, addr + PMD_SIZE - 1, p, node);
		} else {
			/* Already mapped: check node locality only. */
			vmemmap_verify((pte_t *)pmd, node, addr, next);
		}
	}
	return 0;
}
#endif