/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>

/*
 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
 * The direct mapping extends to max_pfn_mapped, so that we can directly access
 * apertures, ACPI and other tables without having to play with fixmaps.
 */
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init() allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the location of the first one and
 * move around without checking the pgd every time.
 */

pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);

static int disable_nx __cpuinitdata;

/*
 * noexec=on|off
 * Control non-executable mappings for 64-bit processes.
 *
 * on	Enable (default)
 * off	Disable
 */
static int __init nonx_setup(char *str)
{
	if (!str)
		return -EINVAL;
	if (!strncmp(str, "on", 2)) {
		__supported_pte_mask |= _PAGE_NX;
		disable_nx = 0;
	} else if (!strncmp(str, "off", 3)) {
		disable_nx = 1;
		__supported_pte_mask &= ~_PAGE_NX;
	}
	return 0;
}
early_param("noexec", nonx_setup);

void __cpuinit check_efer(void)
{
	unsigned long efer;

	rdmsrl(MSR_EFER, efer);
	if (!(efer & EFER_NX) || disable_nx)
		__supported_pte_mask &= ~_PAGE_NX;
}

int force_personality32;

/*
 * noexec32=on|off
 * Control non-executable heap for 32-bit processes.
 * To control the stack too, use noexec=off.
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	else if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	return 1;
}
__setup("noexec32=", nonx32_setup);

/*
 * NOTE: This function is marked __ref because it calls __init function
 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

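/*
 * fill_pud/fill_pmd/fill_pte walk one level of the kernel page tables for
 * vaddr, allocating the next-level table with spp_getpage() when the entry
 * is still empty, and return a pointer to the entry covering vaddr.
 */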
static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
{
	if (pgd_none(*pgd)) {
		pud_t *pud = (pud_t *)spp_getpage();
		pgd_populate(&init_mm, pgd, pud);
		if (pud != pud_offset(pgd, 0))
			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
			       pud, pud_offset(pgd, 0));
	}
	return pud_offset(pgd, vaddr);
}

static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
	if (pud_none(*pud)) {
		pmd_t *pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0))
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
			       pmd, pmd_offset(pud, 0));
	}
	return pmd_offset(pud, vaddr);
}

static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0))
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
	}
	return pte_offset_kernel(pmd, vaddr);
}

void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	pmd = fill_pmd(pud, vaddr);
	pte = fill_pte(pmd, vaddr);

	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}

pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = pgd_offset_k(vaddr);
	pud = fill_pud(pgd, vaddr);
	return fill_pmd(pud, vaddr);
}

pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd;

	pmd = populate_extra_pmd(vaddr);
	return fill_pte(pmd, vaddr);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			pud = (pud_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(pgd, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

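/*
 * Return a zeroed page for early page-table construction: from the page
 * allocator once after_bootmem is set, otherwise from the early page-table
 * window between e820_table_end and e820_table_top.  *phys is set to the
 * physical address of the page.
 */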
static __ref void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = e820_table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= e820_table_top)
		panic("alloc_low_page: ran out of memory");

	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

static __ref void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

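/*
 * Fill in 4k pte entries for the range [addr, end) in pte_page with prot,
 * leaving entries that are already populated untouched.  Returns the
 * address just past the last page mapped.
 */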
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
	      pgprot_t prot)
{
	unsigned pages = 0;
	unsigned long last_map_addr = end;
	int i;

	pte_t *pte = pte_page + pte_index(addr);

	for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			if (!after_bootmem) {
				for(; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume whoever pre-set up
		 * these mappings knows what they are doing.
		 */
		if (pte_val(*pte)) {
			pages++;
			continue;
		}

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
		pgprot_t prot)
{
	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);

	return phys_pte_init(pte, address, end, prot);
}

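/*
 * Fill in pmd entries for the range [address, end): use 2M pages when
 * page_size_mask allows it, otherwise build a 4k pte page via
 * phys_pte_init().  Returns the address just past the last range mapped.
 */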
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
	      unsigned long page_size_mask, pgprot_t prot)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long pte_phys;
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;
		pgprot_t new_prot = prot;

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd)) {
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				last_map_addr = phys_pte_update(pmd, address,
								end, prot);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping,
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				pages++;
				continue;
			}
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte(address >> PAGE_SHIFT,
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
			continue;
		}

		pte = alloc_low_page(&pte_phys);
		last_map_addr = phys_pte_init(pte, address, end, new_prot);
		unmap_low_page(pte);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return last_map_addr;
}

static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
		unsigned long page_size_mask, pgprot_t prot)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long last_map_addr;

	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
	__flush_tlb_all();
	return last_map_addr;
}

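/*
 * Same one level up: fill in pud entries for [addr, end), using 1G pages
 * when page_size_mask allows it and descending into phys_pmd_init()
 * otherwise.
 */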
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;
		pgprot_t prot = PAGE_KERNEL;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud)) {
				last_map_addr = phys_pmd_update(pud, addr, end,
							 page_size_mask, prot);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection  bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				pages++;
				continue;
			}
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
					      prot);
		unmap_low_page(pmd);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
		 unsigned long page_size_mask)
{
	pud_t *pud;

	pud = (pud_t *)pgd_page_vaddr(*pgd);

	return phys_pud_init(pud, addr, end, page_size_mask);
}

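/*
 * Create the kernel direct mapping for the physical range [start, end),
 * descending through phys_pud_init()/phys_pmd_init() and using the large
 * page sizes permitted by page_size_mask.  Returns the address just past
 * the last range actually mapped.
 */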
unsigned long __init
kernel_physical_mapping_init(unsigned long start,
			     unsigned long end,
			     unsigned long page_size_mask)
{

	unsigned long next, last_map_addr = end;

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		next = (start + PGDIR_SIZE) & PGDIR_MASK;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			last_map_addr = phys_pud_update(pgd, __pa(start),
						 __pa(end), page_size_mask);
			continue;
		}

		pud = alloc_low_page(&pud_phys);
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);
		unmap_low_page(pud);

		spin_lock(&init_mm.page_table_lock);
		pgd_populate(&init_mm, pgd, __va(pud_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	return last_map_addr;
}

#ifndef CONFIG_NUMA
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long bootmap_size, bootmap;

	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
				 PAGE_SIZE);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
	/* don't touch min_low_pfn */
	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
					 0, end_pfn);
	e820_register_active_regions(0, start_pfn, end_pfn);
	free_bootmem_with_active_regions(0, end_pfn);
	early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	memory_present(0, 0, max_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(nid, zone, start_pfn, nr_pages);
	WARN_ON_ONCE(ret);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
			 kcore_modules, kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;
	unsigned long absent_pages;

	pci_iommu_alloc();

	/* clear_bss() already cleared the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif

	absent_pages = absent_pages_in_range(0, max_pfn);
	reservedpages = max_pfn - totalram_pages - absent_pages;
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
			 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		absent_pages << (PAGE_SHIFT-10),
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
	unsigned long rodata_start =
		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;

#ifdef CONFIG_DYNAMIC_FTRACE
	/* Dynamic tracing modifies the kernel text section */
	start = rodata_start;
#endif

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}

#endif

int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
				   int flags)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
	int ret;
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= max_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return -EFAULT;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
				phys, len);
		return -EFAULT;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	if (nid == next_nid)
		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
	else
		ret = reserve_bootmem(phys, len, flags);

	if (ret != 0)
		return ret;

#else
	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif

	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}

	return 0;
}

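/*
 * Report whether a kernel virtual address is backed by an actual mapping,
 * by checking that it is canonical and walking the page tables down to the
 * pte (or large pmd) level.
 */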
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif