/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>

/*
 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
 * The direct mapping extends to max_pfn_mapped, so that we can directly access
 * apertures, ACPI and other tables without having to play with fixmaps.
 */
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

int direct_gbpages __meminitdata
#ifdef CONFIG_DIRECT_GBPAGES
				= 1
#endif
;

static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

int after_bootmem;

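/*
 * Allocate a page for a new intermediate page-table level.  Early in
 * boot the page has to come from the bootmem allocator; once the page
 * allocator is up (after_bootmem) a GFP_ATOMIC page is used instead.
 */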
static __init void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

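/*
 * Install a single kernel pte below the given pud page, allocating any
 * missing pmd/pte level with spp_getpage().  Used (via set_pte_vaddr())
 * for fixmap-style one-off kernel mappings.
 */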
void
set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
				pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
			return;
		}
	}

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) && pte_val(new_pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

void
set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			pud = (pud_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(pgd, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;
static unsigned long __meminitdata table_top;

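/*
 * Allocate a zeroed page for early page tables.  Before bootmem is
 * ready the page is taken from the [table_start, table_top) block
 * reserved by find_early_table_space() and has to be reached through
 * early_ioremap(), as it may not be covered by the direct mapping yet.
 */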
static __meminit void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= table_top)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

static __meminit void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

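/*
 * Populate one pte page with 4k mappings for the physical range
 * [addr, end) and return the last physical address mapped.
 */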
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
{
	unsigned pages = 0;
	unsigned long last_map_addr = end;
	int i;

	pte_t *pte = pte_page + pte_index(addr);

	for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

		if (pte_val(*pte))
			continue;

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
		pages++;
	}
	update_page_count(PG_LEVEL_4K, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
{
	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);

	return phys_pte_init(pte, address, end);
}

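/*
 * Populate one pmd page for the physical range [address, end), using
 * 2M pages when the page_size_mask allows it and falling back to a
 * pte page otherwise.  Already-present entries are left alone.
 */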
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long pte_phys;
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd)) {
			if (!pmd_large(*pmd))
				last_map_addr = phys_pte_update(pmd, address,
								 end);
			continue;
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			set_pte((pte_t *)pmd,
				pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
			continue;
		}

		pte = alloc_low_page(&pte_phys);
		last_map_addr = phys_pte_init(pte, address, end);
		unmap_low_page(pte);

		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
	}
	update_page_count(PG_LEVEL_2M, pages);
	return last_map_addr;
}

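/*
 * Re-populate an already existing pmd page (e.g. on memory hotplug)
 * under init_mm.page_table_lock.
 */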
static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
			 unsigned long page_size_mask)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long last_map_addr;

	spin_lock(&init_mm.page_table_lock);
	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
	return last_map_addr;
}

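/*
 * Populate one pud page for the physical range [addr, end), using 1G
 * pages when permitted by page_size_mask.  Regions without any e820
 * backing are cleared rather than mapped.
 */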
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud))
				last_map_addr = phys_pmd_update(pud, addr, end,
							 page_size_mask);
			continue;
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);

		spin_lock(&init_mm.page_table_lock);
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
		unmap_low_page(pmd);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		spin_unlock(&init_mm.page_table_lock);

	}
	__flush_tlb_all();
	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
		 unsigned long page_size_mask)
{
	pud_t *pud;

	pud = (pud_t *)pgd_page_vaddr(*pgd);

	return phys_pud_init(pud, addr, end, page_size_mask);
}

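/*
 * Estimate the worst-case amount of memory the early page tables for
 * the direct mapping of [0, end) can take, and reserve a physically
 * contiguous block for them from the e820 map.  Large pages make the
 * pmd/pte estimates much smaller.
 */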
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, ptes, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
	if (direct_gbpages) {
		unsigned long extra;
		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
	} else
		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

	if (cpu_has_pse) {
		unsigned long extra;
		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
	} else
		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);

	/*
	 * RED-PEN putting page tables only on node 0 could
	 * cause a hotspot and fill up ZONE_DMA. The page tables
	 * need roughly 0.5KB per GB.
	 */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;
	table_top = table_start + (tables >> PAGE_SHIFT);

	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
}

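/* Honour "gbpages"/"nogbpages" only if the CPU can really do 1G pages. */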
static void __init init_gbpages(void)
{
	if (direct_gbpages && cpu_has_gbpages)
		printk(KERN_INFO "Using GB pages for direct mapping\n");
	else
		direct_gbpages = 0;
}

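/*
 * Walk the pgd entries covering __va(start)..__va(end) and build the
 * direct mapping with the largest page sizes allowed by page_size_mask.
 * Returns the last physical address mapped.
 */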
static unsigned long __init kernel_physical_mapping_init(unsigned long start,
						unsigned long end,
						unsigned long page_size_mask)
{

	unsigned long next, last_map_addr = end;

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		next = (start + PGDIR_SIZE) & PGDIR_MASK;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			last_map_addr = phys_pud_update(pgd, __pa(start),
						 __pa(end), page_size_mask);
			continue;
		}

		if (after_bootmem)
			pud = pud_offset(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);
		unmap_low_page(pud);
		pgd_populate(&init_mm, pgd_offset_k(start),
			     __va(pud_phys));
	}

	return last_map_addr;
}

struct map_range {
	unsigned long start;
	unsigned long end;
	unsigned page_size_mask;
};

#define NR_RANGE_MR 5

static int save_mr(struct map_range *mr, int nr_range,
		   unsigned long start_pfn, unsigned long end_pfn,
		   unsigned long page_size_mask)
{

	if (start_pfn < end_pfn) {
		if (nr_range >= NR_RANGE_MR)
			panic("run out of range for init_memory_mapping\n");
		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
		mr[nr_range].page_size_mask = page_size_mask;
		nr_range++;
	}

	return nr_range;
}

/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
unsigned long __init_refok init_memory_mapping(unsigned long start,
					       unsigned long end)
{
	unsigned long last_map_addr = 0;
	unsigned long page_size_mask = 0;
	unsigned long start_pfn, end_pfn;

	struct map_range mr[NR_RANGE_MR];
	int nr_range, i;

	printk(KERN_INFO "init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 *
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem)
		init_gbpages();

	if (direct_gbpages)
		page_size_mask |= 1 << PG_LEVEL_1G;
	if (cpu_has_pse)
		page_size_mask |= 1 << PG_LEVEL_2M;

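	/*
	 * Split the range into at most NR_RANGE_MR pieces: an unaligned
	 * 4k head, a 2M-aligned piece, a 1G-aligned middle, then 2M and
	 * 4k tails, so that each piece is mapped with the largest page
	 * size it allows.
	 */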
	memset(mr, 0, sizeof(mr));
	nr_range = 0;

	/* head if not big page aligned */
	start_pfn = start >> PAGE_SHIFT;
	end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
			<< (PMD_SHIFT - PAGE_SHIFT);
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);

	/* big page (2M) range */
	start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
			 << (PMD_SHIFT - PAGE_SHIFT);
	end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
			 << (PUD_SHIFT - PAGE_SHIFT);
	if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
		end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
			page_size_mask & (1<<PG_LEVEL_2M));

	/* big page (1G) range */
	start_pfn = end_pfn;
	end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
				page_size_mask &
				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));

	/* tail is not big page (1G) aligned */
	start_pfn = end_pfn;
	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
			page_size_mask & (1<<PG_LEVEL_2M));

	/* tail is not big page (2M) aligned */
	start_pfn = end_pfn;
	end_pfn = end>>PAGE_SHIFT;
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);

	/* try to merge contiguous ranges with the same page size */
	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
		unsigned long old_start;
		if (mr[i].end != mr[i+1].start ||
		    mr[i].page_size_mask != mr[i+1].page_size_mask)
			continue;
		/* move it */
		old_start = mr[i].start;
		memmove(&mr[i], &mr[i+1],
			 (nr_range - 1 - i) * sizeof (struct map_range));
		mr[i].start = old_start;
		nr_range--;
	}

	for (i = 0; i < nr_range; i++)
		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
				mr[i].start, mr[i].end,
			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));

	if (!after_bootmem)
		find_early_table_space(end);

	for (i = 0; i < nr_range; i++)
		last_map_addr = kernel_physical_mapping_init(
					mr[i].start, mr[i].end,
					mr[i].page_size_mask);

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();

	if (!after_bootmem && table_end > table_start)
		reserve_early(table_start << PAGE_SHIFT,
				 table_end << PAGE_SHIFT, "PGTABLE");

	printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
			 last_map_addr, end);

	if (!after_bootmem)
		early_memtest(start, end);

	return last_map_addr >> PAGE_SHIFT;
}

#ifndef CONFIG_NUMA
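/*
 * Flat (non-NUMA) bootmem setup: place the bootmem bitmap via the e820
 * map, register all of RAM as node 0 and convert the early reservations
 * into bootmem reservations.
 */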
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long bootmap_size, bootmap;

	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
				 PAGE_SIZE);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
	/* don't touch min_low_pfn */
	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
					 0, end_pfn);
	e820_register_active_regions(0, start_pfn, end_pfn);
	free_bootmem_with_active_regions(0, end_pfn);
	early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	memory_present(0, 0, max_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size-1);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(zone, start_pfn, nr_pages);
	WARN_ON(1);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 * On x86, access has to be given to the first megabyte of RAM because that
 * area contains BIOS code and data regions used by X and dosemu and similar
 * apps. Access has to be given to non-kernel-RAM areas as well; these
 * contain the PCI MMIO resources as well as potential BIOS/ACPI data
 * regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)
		return 1;
	if (!page_is_ram(pagenr))
		return 1;
	return 0;
}


static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
			 kcore_modules, kcore_vsyscall;

void __init mem_init(void)
{
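	/*
	 * Hand all remaining bootmem pages to the page allocator, account
	 * the reserved pages and register the /proc/kcore areas.
	 */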
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear_bss() already cleared the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = max_pfn - totalram_pages -
					absent_pages_in_range(0, max_pfn);
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
				"%ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);

	cpa_init();
}

void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr = begin;

	if (addr >= end)
		return;

	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
		begin, PAGE_ALIGN(end));
	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);

	for (; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
#endif
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
	unsigned long rodata_start =
		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;

#ifdef CONFIG_DYNAMIC_FTRACE
	/* Dynamic tracing modifies the kernel text section */
	start = rodata_start;
#endif

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}

#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

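/*
 * Reserve a range on behalf of early callers.  On NUMA the range is
 * reserved on its home node when it does not cross a node boundary;
 * pages below MAX_DMA_PFN are additionally accounted via
 * set_dma_reserve().
 */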
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
				   int flags)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
	int ret;
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= max_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return -EFAULT;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
				phys, len);
		return -EFAULT;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	if (nid == next_nid)
		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
	else
		ret = reserve_bootmem(phys, len, flags);

	if (ret != 0)
		return ret;

#else
	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif

	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}

	return 0;
}

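/*
 * Walk the kernel page tables and check whether @addr is backed by a
 * valid pfn.  The 'above' test filters out non-canonical addresses,
 * whose top bits are neither all zeroes nor all ones.
 */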
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;
L
}

T
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif