/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>

static unsigned long dma_reserve __initdata;

static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);

int force_personality32;

/*
 * noexec32=on|off
 * Control non executable heap for 32bit processes.
 * To control the stack too use noexec=off
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	else if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	return 1;
}
__setup("noexec32=", nonx32_setup);

/*
 * NOTE: This function is marked __ref because it calls __init function
 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

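/*
 * The fill_* helpers walk one level of the kernel page tables: when the
 * entry is empty they allocate the next-level table with spp_getpage()
 * and hook it up, then return the entry that covers 'vaddr'.
 */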
static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
{
	if (pgd_none(*pgd)) {
		pud_t *pud = (pud_t *)spp_getpage();
		pgd_populate(&init_mm, pgd, pud);
		if (pud != pud_offset(pgd, 0))
			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
			       pud, pud_offset(pgd, 0));
	}
	return pud_offset(pgd, vaddr);
}

static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
	if (pud_none(*pud)) {
		pmd_t *pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0))
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
			       pmd, pmd_offset(pud, 0));
	}
	return pmd_offset(pud, vaddr);
}

static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0))
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
	}
	return pte_offset_kernel(pmd, vaddr);
}

void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	pmd = fill_pmd(pud, vaddr);
	pte = fill_pte(pmd, vaddr);

	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}

pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = pgd_offset_k(vaddr);
	pud = fill_pud(pgd, vaddr);
	return fill_pmd(pud, vaddr);
}

pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd;

	pmd = populate_extra_pmd(vaddr);
	return fill_pte(pmd, vaddr);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			pud = (pud_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(pgd, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

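/*
 * Allocate one zeroed page for building early page tables.  Before
 * bootmem is up the page comes from the e820_table_end/e820_table_top
 * window and is mapped with early_memremap(); later a normal zeroed
 * page is used.  *phys is set to the physical address of the page.
 */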
static __ref void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = e820_table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= e820_table_top)
		panic("alloc_low_page: ran out of memory");

	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

static __ref void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

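/*
 * phys_pte_init/phys_pmd_init/phys_pud_init populate one level of the
 * direct mapping for a physical range; each returns the last physical
 * address it mapped.
 */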
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
	      pgprot_t prot)
{
	unsigned pages = 0;
	unsigned long last_map_addr = end;
	int i;

	pte_t *pte = pte_page + pte_index(addr);

	for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			if (!after_bootmem) {
				for(; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume whoever pre-set up
		 * these mappings knows what they are doing.
		 */
		if (pte_val(*pte)) {
			pages++;
			continue;
		}

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
		pgprot_t prot)
{
	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);

	return phys_pte_init(pte, address, end, prot);
}

static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
	      unsigned long page_size_mask, pgprot_t prot)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long pte_phys;
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;
		pgprot_t new_prot = prot;

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd)) {
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				last_map_addr = phys_pte_update(pmd, address,
								end, prot);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping,
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				pages++;
				continue;
			}
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte(address >> PAGE_SHIFT,
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
			continue;
		}

		pte = alloc_low_page(&pte_phys);
		last_map_addr = phys_pte_init(pte, address, end, new_prot);
		unmap_low_page(pte);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return last_map_addr;
}

static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
		unsigned long page_size_mask, pgprot_t prot)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long last_map_addr;

	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
	__flush_tlb_all();
	return last_map_addr;
}

static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;
		pgprot_t prot = PAGE_KERNEL;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud)) {
				last_map_addr = phys_pmd_update(pud, addr, end,
							 page_size_mask, prot);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				pages++;
				continue;
			}
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
					      prot);
		unmap_low_page(pmd);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
		 unsigned long page_size_mask)
{
	pud_t *pud;

	pud = (pud_t *)pgd_page_vaddr(*pgd);

	return phys_pud_init(pud, addr, end, page_size_mask);
}

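/*
 * Build the kernel direct mapping for the physical range [start, end),
 * using 2M or 1G pages where page_size_mask allows it, and return the
 * last physical address that was mapped.
 */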
unsigned long __meminit
kernel_physical_mapping_init(unsigned long start,
			     unsigned long end,
			     unsigned long page_size_mask)
{
	unsigned long next, last_map_addr = end;

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		next = (start + PGDIR_SIZE) & PGDIR_MASK;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			last_map_addr = phys_pud_update(pgd, __pa(start),
						 __pa(end), page_size_mask);
			continue;
		}

		pud = alloc_low_page(&pud_phys);
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);
		unmap_low_page(pud);

		spin_lock(&init_mm.page_table_lock);
		pgd_populate(&init_mm, pgd, __va(pud_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	return last_map_addr;
}

#ifndef CONFIG_NUMA
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
				int acpi, int k8)
{
	unsigned long bootmap_size, bootmap;

	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
				 PAGE_SIZE);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
	/* don't touch min_low_pfn */
	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
					 0, end_pfn);
	e820_register_active_regions(0, start_pfn, end_pfn);
	free_bootmem_with_active_regions(0, end_pfn);
}
#endif

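/*
 * Set up the per-zone maximum PFNs, register the present memory ranges
 * for the sparse memory model and initialize the node/zone data.
 */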
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	/*
	 * clear the default setting with node 0
	 * note: don't use nodes_clear here, that is really clearing when
	 *	 numa support is not compiled in, and later node_set_state
	 *	 will not set it back.
	 */
	node_clear_state(0, N_NORMAL_MEMORY);

	free_area_init_nodes(max_zone_pfns);
}

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
 * updating.
 */
static void  update_end_of_memory_vars(u64 start, u64 size)
{
	unsigned long end_pfn = PFN_UP(start + size);

	if (end_pfn > max_pfn) {
		max_pfn = end_pfn;
		max_low_pfn = end_pfn;
		high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
	}
}

/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(nid, zone, start_pfn, nr_pages);
	WARN_ON_ONCE(ret);

	/* update max_pfn, max_low_pfn and high_memory */
	update_end_of_memory_vars(start, size);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

static struct kcore_list kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;
	unsigned long absent_pages;

	pci_iommu_alloc();

	/* clear_bss() already cleared the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif

	absent_pages = absent_pages_in_range(0, max_pfn);
	reservedpages = max_pfn - totalram_pages - absent_pages;
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
			 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
			 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		absent_pages << (PAGE_SHIFT-10),
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

int kernel_set_to_readonly;

void set_kernel_text_rw(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__stop___ex_table);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read write\n",
		 start, end);

	/*
	 * Make the kernel identity mapping for text RW. Kernel text
	 * mapping will always be RO. Refer to the comment in
	 * static_protections() in pageattr.c
	 */
	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}

void set_kernel_text_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__stop___ex_table);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read only\n",
		 start, end);

	/*
	 * Set the kernel identity mapping for text RO.
	 */
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
}

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long rodata_start =
		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
	unsigned long data_start = (unsigned long) &_sdata;

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	kernel_set_to_readonly = 1;

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif

	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(text_end)),
			(unsigned long)
				 page_address(virt_to_page(rodata_start)));
	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(rodata_end)),
			(unsigned long) page_address(virt_to_page(data_start)));
}

#endif

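/*
 * Reserve a physical range with the bootmem allocator, using the node-aware
 * variant on NUMA, and account any reservation below MAX_DMA_PFN against
 * the DMA zone via set_dma_reserve().
 */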
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
				   int flags)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
	int ret;
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= max_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return -EFAULT;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
				phys, len);
		return -EFAULT;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	if (nid == next_nid)
		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
	else
		ret = reserve_bootmem(phys, len, flags);

	if (ret != 0)
		return ret;

#else
	reserve_bootmem(phys, len, flags);
#endif

	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}

	return 0;
}

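/*
 * Walk the kernel page tables to check whether a kernel virtual address
 * is backed by a present mapping.
 */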
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

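/*
 * Populate the vmemmap for [start_page, start_page + size): PMD-sized (2MB)
 * blocks when the CPU supports PSE, individual 4k pages otherwise.
 */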
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif