/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>

/*
 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
 * The direct mapping extends to max_pfn_mapped, so that we can directly access
 * apertures, ACPI and other tables without having to play with fixmaps.
 */
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

int direct_gbpages
#ifdef CONFIG_DIRECT_GBPAGES
				= 1
#endif
;

static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

int after_bootmem;

/*
 * NOTE: This function is marked __ref because it calls the __init function
 * (alloc_bootmem_pages). It's safe to do that ONLY when after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

void
set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
				pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
			return;
		}
	}

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) && pte_val(new_pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

void
set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}
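
/*
 * Typical caller (a sketch for orientation only; the fixmap helpers live
 * elsewhere and their exact shape may differ from this):
 *
 *	void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
 *	{
 *		unsigned long address = __fix_to_virt(idx);
 *
 *		if (idx >= __end_of_fixed_addresses) {
 *			BUG();
 *			return;
 *		}
 *		set_pte_vaddr(address, pte);
 *	}
 */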

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			pud = (pud_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(pgd, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}
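
/*
 * Illustrative use of the helpers above (assumed values, not code from
 * this file): a platform that must reach a 16MB hardware register window
 * before ioremap() is available could pre-map it uncached with
 *
 *	init_extra_mapping_uc(mmr_base_phys, 16UL << 20);
 *
 * where mmr_base_phys is a hypothetical, PMD-aligned physical base.
 */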

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}
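
/*
 * Worked example (sizes assumed for illustration): with a ~10MB kernel,
 * "end" rounds up to 12MB above __START_KERNEL_map.  level2_kernel_pgt
 * has PTRS_PER_PMD == 512 entries of PMD_SIZE == 2MB each (1GB of
 * virtual space); the loop keeps only the six pmds inside [_text, 12MB)
 * and clears the rest, including the bogus negative-offset pmds that sit
 * below _text.
 */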

static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;
static unsigned long __meminitdata table_top;

static __ref void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= table_top)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

static __ref void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}
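
/*
 * Why early_ioremap() above: while the direct mapping is still being
 * built, the freshly allocated table pages may lie outside the range
 * that is already mapped, so they are reached through a temporary
 * fixmap-based window and released again with unmap_low_page().
 */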

static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
{
	unsigned pages = 0;
	unsigned long last_map_addr = end;
	int i;

	pte_t *pte = pte_page + pte_index(addr);

	for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

		if (pte_val(*pte))
			continue;

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
		pages++;
	}
	update_page_count(PG_LEVEL_4K, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
{
	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);

	return phys_pte_init(pte, address, end);
}

static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	unsigned long start = address;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long pte_phys;
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd)) {
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				last_map_addr = phys_pte_update(pmd, address,
								end);
				spin_unlock(&init_mm.page_table_lock);
			}
			/* Count entries we're using from level2_ident_pgt */
			if (start == 0)
				pages++;
			continue;
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
			continue;
		}

		pte = alloc_low_page(&pte_phys);
		last_map_addr = phys_pte_init(pte, address, end);
		unmap_low_page(pte);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return last_map_addr;
}

static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
			 unsigned long page_size_mask)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long last_map_addr;

	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
	__flush_tlb_all();
	return last_map_addr;
}

static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud))
				last_map_addr = phys_pmd_update(pud, addr, end,
							 page_size_mask);
			continue;
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
		unmap_low_page(pmd);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();
	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
		 unsigned long page_size_mask)
{
	pud_t *pud;

	pud = (pud_t *)pgd_page_vaddr(*pgd);

	return phys_pud_init(pud, addr, end, page_size_mask);
}

static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, ptes, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
	if (direct_gbpages) {
		unsigned long extra;
		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
	} else
		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

	if (cpu_has_pse) {
		unsigned long extra;
		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
	} else
		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);

	/*
	 * RED-PEN putting page tables only on node 0 could
	 * cause a hotspot and fill up ZONE_DMA. The page tables
	 * need roughly 0.5KB per GB.
	 */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;
	table_top = table_start + (tables >> PAGE_SHIFT);

	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
}
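
/*
 * Sizing example (numbers assumed for illustration): for end == 4GB with
 * 2MB pages available and no gbpages, puds = 4 (one page after rounding),
 * pmds = 2048 (16KB, i.e. four pages), and no pte pages at all since 4GB
 * is 2MB aligned -- about 20KB of early page-table space in total.
 */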

static void __init init_gbpages(void)
{
	if (direct_gbpages && cpu_has_gbpages)
		printk(KERN_INFO "Using GB pages for direct mapping\n");
	else
		direct_gbpages = 0;
}

static unsigned long __init kernel_physical_mapping_init(unsigned long start,
						unsigned long end,
						unsigned long page_size_mask)
{

	unsigned long next, last_map_addr = end;

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		next = (start + PGDIR_SIZE) & PGDIR_MASK;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			last_map_addr = phys_pud_update(pgd, __pa(start),
						 __pa(end), page_size_mask);
			continue;
		}

		pud = alloc_low_page(&pud_phys);
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);
		unmap_low_page(pud);

		spin_lock(&init_mm.page_table_lock);
		pgd_populate(&init_mm, pgd, __va(pud_phys));
		spin_unlock(&init_mm.page_table_lock);
	}

	return last_map_addr;
}
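
/*
 * Scale note (illustrative): each pgd entry spans PGDIR_SIZE == 512GB of
 * virtual space with 4-level paging, so the loop above usually completes
 * in a single iteration on machines with less than 512GB of RAM.
 */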

struct map_range {
	unsigned long start;
	unsigned long end;
	unsigned page_size_mask;
};

#define NR_RANGE_MR 5

static int save_mr(struct map_range *mr, int nr_range,
		   unsigned long start_pfn, unsigned long end_pfn,
		   unsigned long page_size_mask)
{

	if (start_pfn < end_pfn) {
		if (nr_range >= NR_RANGE_MR)
			panic("run out of range for init_memory_mapping\n");
		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
		mr[nr_range].page_size_mask = page_size_mask;
		nr_range++;
	}

	return nr_range;
}
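
/*
 * Worked example (addresses assumed for illustration): a request of
 * [8MB, 2GB+16MB) with 2MB pages but no 1GB pages is split below into a
 * 2MB-mapped range [8MB, 2GB), an empty 1GB range, and a 2MB-mapped tail
 * [2GB, 2GB+16MB); the merge loop then folds the two 2MB ranges into one.
 */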

/*
 * Set up the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
unsigned long __init_refok init_memory_mapping(unsigned long start,
					       unsigned long end)
{
	unsigned long last_map_addr = 0;
	unsigned long page_size_mask = 0;
	unsigned long start_pfn, end_pfn;

	struct map_range mr[NR_RANGE_MR];
	int nr_range, i;

	printk(KERN_INFO "init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 *
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem)
		init_gbpages();

	if (direct_gbpages)
		page_size_mask |= 1 << PG_LEVEL_1G;
	if (cpu_has_pse)
		page_size_mask |= 1 << PG_LEVEL_2M;

	memset(mr, 0, sizeof(mr));
	nr_range = 0;

	/* head if not big page alignment ? */
	start_pfn = start >> PAGE_SHIFT;
	end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
			<< (PMD_SHIFT - PAGE_SHIFT);
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);

	/* big page (2M) range */
	start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
			 << (PMD_SHIFT - PAGE_SHIFT);
	end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
			 << (PUD_SHIFT - PAGE_SHIFT);
	if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
		end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
			page_size_mask & (1<<PG_LEVEL_2M));

	/* big page (1G) range */
	start_pfn = end_pfn;
	end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
				page_size_mask &
				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));

	/* tail is not big page (1G) alignment */
	start_pfn = end_pfn;
	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
			page_size_mask & (1<<PG_LEVEL_2M));

	/* tail is not big page (2M) alignment */
	start_pfn = end_pfn;
	end_pfn = end>>PAGE_SHIFT;
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);

	/* try to merge same page size and continuous */
	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
		unsigned long old_start;
		if (mr[i].end != mr[i+1].start ||
		    mr[i].page_size_mask != mr[i+1].page_size_mask)
			continue;
		/* move it */
		old_start = mr[i].start;
		memmove(&mr[i], &mr[i+1],
			 (nr_range - 1 - i) * sizeof(struct map_range));
		mr[i].start = old_start;
		nr_range--;
	}

	for (i = 0; i < nr_range; i++)
		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
				mr[i].start, mr[i].end,
			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));

	if (!after_bootmem)
		find_early_table_space(end);

	for (i = 0; i < nr_range; i++)
		last_map_addr = kernel_physical_mapping_init(
					mr[i].start, mr[i].end,
					mr[i].page_size_mask);

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();

	if (!after_bootmem && table_end > table_start)
		reserve_early(table_start << PAGE_SHIFT,
				 table_end << PAGE_SHIFT, "PGTABLE");

	printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
			 last_map_addr, end);

	if (!after_bootmem)
		early_memtest(start, end);

	return last_map_addr >> PAGE_SHIFT;
}

#ifndef CONFIG_NUMA
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long bootmap_size, bootmap;

	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
				 PAGE_SIZE);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
	/* don't touch min_low_pfn */
	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
					 0, end_pfn);
	e820_register_active_regions(0, start_pfn, end_pfn);
	free_bootmem_with_active_regions(0, end_pfn);
	early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	memory_present(0, 0, max_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
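
/*
 * For reference (limits assumed from the x86 headers): MAX_DMA_PFN caps
 * ZONE_DMA at 16MB and MAX_DMA32_PFN caps ZONE_DMA32 at 4GB; all memory
 * above that ends up in ZONE_NORMAL.
 */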
#endif

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size-1);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(zone, start_pfn, nr_pages);
	WARN_ON_ONCE(ret);	/* warn only when __add_pages() actually failed */

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);
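
/*
 * Call path sketch (an assumption for orientation, not code from this
 * file): the generic hotplug core's add_memory(nid, start, size) resolves
 * the target node and calls arch_add_memory() to extend the direct
 * mapping and the ZONE_NORMAL memmap for the new range.
 */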

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 *
 * On x86, access has to be given to the first megabyte of ram because that area
 * contains bios code and data regions used by X and dosemu and similar apps.
 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
 * mmio resources as well as potential bios/acpi data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)
		return 1;
	if (!page_is_ram(pagenr))
		return 1;
	return 0;
}


static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
			 kcore_modules, kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear_bss() has already cleared the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = max_pfn - totalram_pages -
					absent_pages_in_range(0, max_pfn);
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
				"%ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);

	cpa_init();
}

void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr = begin;

	if (addr >= end)
		return;

	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
		begin, PAGE_ALIGN(end));
	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);

	for (; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
#endif
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
	unsigned long rodata_start =
		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;

#ifdef CONFIG_DYNAMIC_FTRACE
	/* Dynamic tracing modifies the kernel text section */
	start = rodata_start;
#endif

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}

#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
				   int flags)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
	int ret;
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= max_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return -EFAULT;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
				phys, len);
		return -EFAULT;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	if (nid == next_nid)
		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
	else
		ret = reserve_bootmem(phys, len, flags);

	if (ret != 0)
		return ret;

#else
	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif

	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}

	return 0;
}

int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}
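
/*
 * Note on the "above" check (assuming the usual __VIRTUAL_MASK_SHIFT of
 * 48): the arithmetic shift replicates bit 47, so a canonical x86_64
 * address yields either all zeroes (user half) or all ones (kernel half);
 * anything else is a non-canonical address and is rejected up front.
 */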

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	return 0;
}
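
/*
 * Back-of-the-envelope (assuming a 64-byte struct page): one 2MB vmemmap
 * block holds 2MB / 64 == 32768 struct pages, i.e. the metadata for
 * 32768 * 4KB == 128MB of physical memory per PMD-level mapping.
 */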

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif