/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>

/*
 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
 * The direct mapping extends to max_pfn_mapped, so that we can directly access
 * apertures, ACPI and other tables without having to play with fixmaps.
 */
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

int direct_gbpages
#ifdef CONFIG_DIRECT_GBPAGES
				= 1
#endif
;

static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

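/*
 * Note: direct_gbpages only records the request to use 1GB pages for the
 * direct mapping; init_gbpages() below clears it again if the CPU does
 * not advertise 1GB page support (cpu_has_gbpages). The "gbpages" and
 * "nogbpages" boot parameters above override the compile-time default.
 */
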
/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

int after_bootmem;

/*
 * NOTE: This function is marked __ref because it calls the __init function
 * alloc_bootmem_pages(). Doing so is safe ONLY while after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

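/*
 * Install a single kernel-space pte under the given pud page, allocating
 * intermediate pmd/pte tables through spp_getpage() as needed. The
 * PAGETABLE BUG printks below catch the case where a freshly populated
 * level does not read back through the offset helpers.
 */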
void
set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
				pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
			return;
		}
	}

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) && pte_val(new_pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

void
set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			pud = (pud_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(pgd, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

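/*
 * Early page-table allocation window: find_early_table_space() reserves
 * the pfn range [table_start, table_top) from the e820 map, and
 * alloc_low_page() hands out pages by bumping table_end through it.
 */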
static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;
static unsigned long __meminitdata table_top;

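/*
 * Grab a zeroed page for early page tables. Before bootmem is running,
 * the page comes from the window reserved above; it is not necessarily
 * covered by the direct mapping yet, so it is accessed through a
 * temporary early_ioremap() that unmap_low_page() undoes.
 */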
static __meminit void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= table_top)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

static __meminit void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

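/*
 * Set up 4k ptes for [addr, end) within one pte page. Entries that are
 * already present are left alone, and when running before bootmem the
 * remainder of the pte page past 'end' is explicitly zeroed.
 */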
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
{
	unsigned pages = 0;
	unsigned long last_map_addr = end;
	int i;

	pte_t *pte = pte_page + pte_index(addr);

	for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

		if (pte_val(*pte))
			continue;

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
		pages++;
	}
	update_page_count(PG_LEVEL_4K, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
{
	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);

	return phys_pte_init(pte, address, end);
}

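/*
 * Fill one pmd page for [address, end). A pmd that is already present is
 * reused: if it maps small pages, phys_pte_update() fills in any missing
 * ptes (entries taken over from level2_ident_pgt are merely counted).
 * Otherwise a 2MB page is used when PG_LEVEL_2M is set in page_size_mask,
 * falling back to a newly allocated pte page.
 */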
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	unsigned long start = address;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long pte_phys;
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd)) {
			if (!pmd_large(*pmd))
				last_map_addr = phys_pte_update(pmd, address,
								 end);
			/* Count entries we're using from level2_ident_pgt */
			if (start == 0)
				pages++;
			continue;
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			set_pte((pte_t *)pmd,
				pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
			continue;
		}

		pte = alloc_low_page(&pte_phys);
		last_map_addr = phys_pte_init(pte, address, end);
		unmap_low_page(pte);

		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
	}
	update_page_count(PG_LEVEL_2M, pages);
	return last_map_addr;
}

static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
			 unsigned long page_size_mask)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long last_map_addr;

	spin_lock(&init_mm.page_table_lock);
	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
	return last_map_addr;
}

static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud))
				last_map_addr = phys_pmd_update(pud, addr, end,
							 page_size_mask);
			continue;
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);

		spin_lock(&init_mm.page_table_lock);
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
		unmap_low_page(pmd);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		spin_unlock(&init_mm.page_table_lock);

	}
	__flush_tlb_all();
	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
		 unsigned long page_size_mask)
{
	pud_t *pud;

	pud = (pud_t *)pgd_page_vaddr(*pgd);

	return phys_pud_init(pud, addr, end, page_size_mask);
}

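/*
 * Worst-case estimate of the space needed for the direct-mapping page
 * tables: one pud entry per 1GB of address space, pmd entries for
 * whatever 1GB pages cannot cover, and pte entries for whatever 2MB
 * pages cannot cover, each level rounded up to whole pages.
 */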
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, ptes, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
	if (direct_gbpages) {
		unsigned long extra;
		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
	} else
		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

	if (cpu_has_pse) {
		unsigned long extra;
		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
	} else
		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);

	/*
	 * RED-PEN putting page tables only on node 0 could
	 * cause a hotspot and fill up ZONE_DMA. The page tables
	 * need roughly 0.5KB per GB.
	 */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;
	table_top = table_start + (tables >> PAGE_SHIFT);

	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
}

static void __init init_gbpages(void)
{
	if (direct_gbpages && cpu_has_gbpages)
		printk(KERN_INFO "Using GB pages for direct mapping\n");
	else
		direct_gbpages = 0;
}

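/*
 * Walk the direct mapping one pgd entry at a time and hand each 512GB
 * chunk to phys_pud_init()/phys_pud_update(). The PG_LEVEL_1G and
 * PG_LEVEL_2M bits in page_size_mask tell the lower levels which large
 * page sizes they are allowed to use.
 */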
static unsigned long __init kernel_physical_mapping_init(unsigned long start,
						unsigned long end,
						unsigned long page_size_mask)
{
	unsigned long next, last_map_addr = end;

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		next = (start + PGDIR_SIZE) & PGDIR_MASK;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			last_map_addr = phys_pud_update(pgd, __pa(start),
						 __pa(end), page_size_mask);
			continue;
		}

		if (after_bootmem)
			pud = pud_offset(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);
		unmap_low_page(pud);
		pgd_populate(&init_mm, pgd_offset_k(start),
			     __va(pud_phys));
	}

	return last_map_addr;
}

struct map_range {
	unsigned long start;
	unsigned long end;
	unsigned page_size_mask;
};

#define NR_RANGE_MR 5

static int save_mr(struct map_range *mr, int nr_range,
		   unsigned long start_pfn, unsigned long end_pfn,
		   unsigned long page_size_mask)
{

	if (start_pfn < end_pfn) {
		if (nr_range >= NR_RANGE_MR)
			panic("run out of range for init_memory_mapping\n");
		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
		mr[nr_range].page_size_mask = page_size_mask;
		nr_range++;
	}

	return nr_range;
}

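/*
 * init_memory_mapping() below splits [start, end) into at most
 * NR_RANGE_MR ranges: a 4k head up to the first 2MB boundary, a 2MB run
 * up to the first 1GB boundary, a 1GB-page middle, and 2MB/4k tails,
 * then merges neighbours that ended up with the same page_size_mask.
 */
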
/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
unsigned long __init_refok init_memory_mapping(unsigned long start,
					       unsigned long end)
{
	unsigned long last_map_addr = 0;
	unsigned long page_size_mask = 0;
	unsigned long start_pfn, end_pfn;

	struct map_range mr[NR_RANGE_MR];
	int nr_range, i;

	printk(KERN_INFO "init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 *
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem)
		init_gbpages();

	if (direct_gbpages)
		page_size_mask |= 1 << PG_LEVEL_1G;
	if (cpu_has_pse)
		page_size_mask |= 1 << PG_LEVEL_2M;

	memset(mr, 0, sizeof(mr));
	nr_range = 0;

	/* head if not big page alignment? */
	start_pfn = start >> PAGE_SHIFT;
	end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
			<< (PMD_SHIFT - PAGE_SHIFT);
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);

	/* big page (2M) range */
	start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
			 << (PMD_SHIFT - PAGE_SHIFT);
	end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
			 << (PUD_SHIFT - PAGE_SHIFT);
	if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
		end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
			page_size_mask & (1<<PG_LEVEL_2M));

	/* big page (1G) range */
	start_pfn = end_pfn;
	end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
				page_size_mask &
				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));

	/* tail is not big page (1G) alignment */
	start_pfn = end_pfn;
	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
			page_size_mask & (1<<PG_LEVEL_2M));

	/* tail is not big page (2M) alignment */
	start_pfn = end_pfn;
	end_pfn = end>>PAGE_SHIFT;
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);

	/* try to merge same page size and continuous */
	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
		unsigned long old_start;
		if (mr[i].end != mr[i+1].start ||
		    mr[i].page_size_mask != mr[i+1].page_size_mask)
			continue;
		/* move it */
		old_start = mr[i].start;
		memmove(&mr[i], &mr[i+1],
			 (nr_range - 1 - i) * sizeof (struct map_range));
		mr[i].start = old_start;
		nr_range--;
	}

	for (i = 0; i < nr_range; i++)
		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
				mr[i].start, mr[i].end,
			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));

	if (!after_bootmem)
		find_early_table_space(end);

	for (i = 0; i < nr_range; i++)
		last_map_addr = kernel_physical_mapping_init(
					mr[i].start, mr[i].end,
					mr[i].page_size_mask);

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();

	if (!after_bootmem && table_end > table_start)
		reserve_early(table_start << PAGE_SHIFT,
				 table_end << PAGE_SHIFT, "PGTABLE");

	printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
			 last_map_addr, end);

	if (!after_bootmem)
		early_memtest(start, end);

	return last_map_addr >> PAGE_SHIFT;
}

#ifndef CONFIG_NUMA
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long bootmap_size, bootmap;

	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
				 PAGE_SIZE);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
	/* don't touch min_low_pfn */
	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
					 0, end_pfn);
	e820_register_active_regions(0, start_pfn, end_pfn);
	free_bootmem_with_active_regions(0, end_pfn);
	early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	memory_present(0, 0, max_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size-1);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(zone, start_pfn, nr_pages);
	WARN_ON(ret);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 * On x86, access has to be given to the first megabyte of RAM because that
 * area contains BIOS code and data regions used by X, dosemu and similar
 * apps. Access has to be given to non-kernel-ram areas as well; these
 * contain the PCI mmio resources as well as potential bios/acpi data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)
		return 1;
	if (!page_is_ram(pagenr))
		return 1;
	return 0;
}


static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
			 kcore_modules, kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear_bss() already cleared the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = max_pfn - totalram_pages -
					absent_pages_in_range(0, max_pfn);
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
				"%ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);

	cpa_init();
}

void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr = begin;

	if (addr >= end)
		return;

	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
		begin, PAGE_ALIGN(end));
	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);

	for (; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
#endif
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
	unsigned long rodata_start =
		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;

#ifdef CONFIG_DYNAMIC_FTRACE
	/* Dynamic tracing modifies the kernel text section */
	start = rodata_start;
#endif

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}

#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
				   int flags)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
	int ret;
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= max_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return -EFAULT;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
				phys, len);
		return -EFAULT;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	if (nid == next_nid)
		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
	else
		ret = reserve_bootmem(phys, len, flags);

	if (ret != 0)
		return ret;

#else
	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif

	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}

	return 0;
}

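/*
 * kern_addr_valid() first rejects non-canonical addresses: arithmetic
 * shifting the address right by __VIRTUAL_MASK_SHIFT must leave either
 * all zeroes or all ones. Canonical addresses are then validated by a
 * full page-table walk.
 */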
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

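/*
 * addr_start/addr_end and p_start/p_end track the current run of
 * physically contiguous PMD mappings so that the debug printout can emit
 * one coalesced "[range] PMD -> [block]" line per run rather than one
 * line per 2MB block; vmemmap_populate_print_last() flushes the final run.
 */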
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif