/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>
#include <linux/gfp.h>

#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>
#include <linux/bootmem.h>
L
Linus Torvalds 已提交
54

55 56
/*
 * Running total, in pages, of the reservations that
 * reserve_bootmem_generic() found to lie entirely below
 * MAX_DMA_PFN * PAGE_SIZE; passed to set_dma_reserve() after each update.
 */
static unsigned long dma_reserve __initdata;

I
Ingo Molnar 已提交
57 58 59 60 61 62 63 64 65 66 67 68 69 70
/*
 * Handler for the "nogbpages" early boot parameter: clears direct_gbpages.
 * The argument string is ignored. Always returns 0.
 */
static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

/*
 * Handler for the "gbpages" early boot parameter: sets direct_gbpages.
 * The argument string is ignored. Always returns 0.
 */
static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

L
Linus Torvalds 已提交
71 72 73 74 75 76
/*
 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
 * physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

77
pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
78 79 80 81
EXPORT_SYMBOL_GPL(__supported_pte_mask);

int force_personality32;

I
Ingo Molnar 已提交
82 83 84 85 86 87 88 89
/*
 * noexec32=on|off
 * Control non executable heap for 32bit processes.
 * To control the stack too use noexec=off
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
90 91 92 93 94 95 96 97 98 99
/*
 * Parse the "noexec32=" boot option.
 *
 * "on" clears READ_IMPLIES_EXEC in force_personality32, "off" sets it.
 * Any other value leaves force_personality32 untouched. Always returns 1.
 */
static int __init nonx32_setup(char *str)
{
	if (strcmp(str, "on") == 0) {
		force_personality32 &= ~READ_IMPLIES_EXEC;
		return 1;
	}

	if (strcmp(str, "off") == 0)
		force_personality32 |= READ_IMPLIES_EXEC;

	return 1;
}
__setup("noexec32=", nonx32_setup);

100 101 102 103 104
/*
 * NOTE: This function is marked __ref because it calls __init function
 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
T
Thomas Gleixner 已提交
105
{
L
Linus Torvalds 已提交
106
	void *ptr;
T
Thomas Gleixner 已提交
107

L
Linus Torvalds 已提交
108
	if (after_bootmem)
109
		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
L
Linus Torvalds 已提交
110 111
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);
T
Thomas Gleixner 已提交
112 113 114 115 116

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}
L
Linus Torvalds 已提交
117

118
	pr_debug("spp_getpage %p\n", ptr);
T
Thomas Gleixner 已提交
119

L
Linus Torvalds 已提交
120
	return ptr;
T
Thomas Gleixner 已提交
121
}
L
Linus Torvalds 已提交
122

123
/*
 * Return the PUD entry covering @vaddr below @pgd.
 *
 * If the PGD entry is empty, a new PUD page is taken from spp_getpage()
 * and installed in init_mm first. The installed table is then read back
 * through pud_offset(); a mismatch is only logged, not treated as fatal.
 */
static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
{
	if (pgd_none(*pgd)) {
		pud_t *pud = (pud_t *)spp_getpage();
		pgd_populate(&init_mm, pgd, pud);
		if (pud != pud_offset(pgd, 0))
			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
			       pud, pud_offset(pgd, 0));
	}
	return pud_offset(pgd, vaddr);
}
L
Linus Torvalds 已提交
134

135
/*
 * Return the PMD entry covering @vaddr below @pud.
 *
 * If the PUD entry is empty, a new PMD page is taken from spp_getpage()
 * and installed in init_mm first. The installed table is then read back
 * through pmd_offset(); a mismatch is only logged, not treated as fatal.
 */
static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
	if (pud_none(*pud)) {
		pmd_t *pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0))
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
			       pmd, pmd_offset(pud, 0));
	}
	return pmd_offset(pud, vaddr);
}

147
/*
 * Return the PTE covering @vaddr below @pmd.
 *
 * If the PMD entry is empty, a new PTE page is taken from spp_getpage()
 * and installed in init_mm first. The installed table is then read back
 * through pte_offset_kernel(); a mismatch is only logged, not treated as
 * fatal.
 */
static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0))
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
	}
	return pte_offset_kernel(pmd, vaddr);
}

/*
 * Install @new_pte as the mapping for @vaddr, starting the walk at the
 * PUD page @pud_page. Missing PMD and PTE pages are created on the way
 * down via fill_pmd()/fill_pte(). The TLB entry for @vaddr is flushed
 * afterwards.
 */
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	pmd = fill_pmd(pud, vaddr);
	pte = fill_pte(pmd, vaddr);

	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

177
/*
 * Install @pteval as the kernel mapping for @vaddr.
 *
 * The PGD entry for @vaddr must already be populated. If it is not, an
 * error is logged and nothing is mapped. Lower levels are created on
 * demand by set_pte_vaddr_pud().
 */
void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}

194
/*
 * Return the kernel PMD entry covering @vaddr, creating the PUD and PMD
 * pages above it if they do not exist yet (see fill_pud()/fill_pmd()).
 */
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = pgd_offset_k(vaddr);
	pud = fill_pud(pgd, vaddr);
	return fill_pmd(pud, vaddr);
}

/*
 * Return the kernel PTE covering @vaddr, creating every missing
 * page-table level above it (populate_extra_pmd() plus fill_pte()).
 */
pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd;

	pmd = populate_extra_pmd(vaddr);
	return fill_pte(pmd, vaddr);
}

212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
/*
 * Create large page table mappings for a range of physical addresses.
 *
 * @phys and @size must both be PMD_SIZE aligned (BUG otherwise). Each
 * PMD_SIZE step is mapped at __va(phys) with protection @prot. Missing
 * PUD/PMD pages come from spp_getpage(). It is a BUG if a target PMD
 * entry is already populated.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));

	while (size) {
		unsigned long vaddr = (unsigned long)__va(phys);
		pgd_t *pgd = pgd_offset_k(vaddr);
		pud_t *pud;
		pmd_t *pmd;

		if (pgd_none(*pgd)) {
			pud_t *new_pud = (pud_t *) spp_getpage();

			set_pgd(pgd, __pgd(__pa(new_pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}

		pud = pud_offset(pgd, vaddr);
		if (pud_none(*pud)) {
			pmd_t *new_pmd = (pmd_t *) spp_getpage();

			set_pud(pud, __pud(__pa(new_pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}

		/* The PMD slot is selected with the physical address here. */
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));

		phys += PMD_SIZE;
		size -= PMD_SIZE;
	}
}

/*
 * Map @size bytes of physical memory starting at @phys using
 * PAGE_KERNEL_LARGE protection; see __init_extra_mapping() for the
 * alignment requirements.
 */
void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}

/*
 * Map @size bytes of physical memory starting at @phys using
 * PAGE_KERNEL_LARGE_NOCACHE protection; see __init_extra_mapping() for
 * the alignment requirements.
 */
void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}

252
/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	/* Walk every entry of level2_kernel_pgt; clear those outside [_text, end]. */
	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

280
static __ref void *alloc_low_page(unsigned long *phys)
T
Thomas Gleixner 已提交
281
{
282
	unsigned long pfn = e820_table_end++;
L
Linus Torvalds 已提交
283 284
	void *adr;

285
	if (after_bootmem) {
286
		adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
287
		*phys = __pa(adr);
T
Thomas Gleixner 已提交
288

289 290 291
		return adr;
	}

292
	if (pfn >= e820_table_top)
T
Thomas Gleixner 已提交
293
		panic("alloc_low_page: ran out of memory");
294

J
Jeremy Fitzhardinge 已提交
295
	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
296
	clear_page(adr);
297 298 299
	*phys  = pfn * PAGE_SIZE;
	return adr;
}
L
Linus Torvalds 已提交
300

301
static __ref void unmap_low_page(void *adr)
T
Thomas Gleixner 已提交
302
{
303 304 305
	if (after_bootmem)
		return;

306
	early_iounmap(adr, PAGE_SIZE);
T
Thomas Gleixner 已提交
307
}
L
Linus Torvalds 已提交
308

309
static unsigned long __meminit
310 311
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
	      pgprot_t prot)
312 313
{
	unsigned pages = 0;
314
	unsigned long last_map_addr = end;
315
	int i;
316

317 318 319 320 321 322 323 324 325 326 327 328
	pte_t *pte = pte_page + pte_index(addr);

	for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			if (!after_bootmem) {
				for(; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

329 330 331 332 333 334
		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume someone who pre-setup
		 * these mappings are more intelligent.
		 */
335 336
		if (pte_val(*pte)) {
			pages++;
337
			continue;
338
		}
339 340 341 342 343

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
344
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
345
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
346
	}
347

348
	update_page_count(PG_LEVEL_4K, pages);
349 350

	return last_map_addr;
351 352
}

353
static unsigned long __meminit
354 355
phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
		pgprot_t prot)
356 357 358
{
	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);

359
	return phys_pte_init(pte, address, end, prot);
360 361
}

362
static unsigned long __meminit
363
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
364
	      unsigned long page_size_mask, pgprot_t prot)
365
{
366
	unsigned long pages = 0;
367
	unsigned long last_map_addr = end;
368

369
	int i = pmd_index(address);
370

371
	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
372
		unsigned long pte_phys;
373
		pmd_t *pmd = pmd_page + pmd_index(address);
374
		pte_t *pte;
375
		pgprot_t new_prot = prot;
376

377
		if (address >= end) {
T
Thomas Gleixner 已提交
378
			if (!after_bootmem) {
379 380
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
T
Thomas Gleixner 已提交
381
			}
382 383
			break;
		}
384

385
		if (pmd_val(*pmd)) {
386 387
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
388
				last_map_addr = phys_pte_update(pmd, address,
389
								end, prot);
390
				spin_unlock(&init_mm.page_table_lock);
391
				continue;
392
			}
393 394 395 396 397 398 399 400 401 402 403 404
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping,
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
405 406
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				pages++;
407
				continue;
408
			}
409
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
410 411
		}

412
		if (page_size_mask & (1<<PG_LEVEL_2M)) {
413
			pages++;
414
			spin_lock(&init_mm.page_table_lock);
415
			set_pte((pte_t *)pmd,
416 417
				pfn_pte(address >> PAGE_SHIFT,
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
418
			spin_unlock(&init_mm.page_table_lock);
419
			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
420
			continue;
421
		}
422

423
		pte = alloc_low_page(&pte_phys);
424
		last_map_addr = phys_pte_init(pte, address, end, new_prot);
425 426
		unmap_low_page(pte);

427
		spin_lock(&init_mm.page_table_lock);
428
		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
429
		spin_unlock(&init_mm.page_table_lock);
430
	}
431
	update_page_count(PG_LEVEL_2M, pages);
432
	return last_map_addr;
433 434
}

435
static unsigned long __meminit
436
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
437
		unsigned long page_size_mask, pgprot_t prot)
438
{
T
Thomas Gleixner 已提交
439
	pmd_t *pmd = pmd_offset(pud, 0);
440 441
	unsigned long last_map_addr;

442
	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
443
	__flush_tlb_all();
444
	return last_map_addr;
445 446
}

447
static unsigned long __meminit
448 449
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
T
Thomas Gleixner 已提交
450
{
451
	unsigned long pages = 0;
452
	unsigned long last_map_addr = end;
453
	int i = pud_index(addr);
454

T
Thomas Gleixner 已提交
455
	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
456 457
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
L
Linus Torvalds 已提交
458
		pmd_t *pmd;
459
		pgprot_t prot = PAGE_KERNEL;
L
Linus Torvalds 已提交
460

461
		if (addr >= end)
L
Linus Torvalds 已提交
462 463
			break;

T
Thomas Gleixner 已提交
464 465 466
		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
L
Linus Torvalds 已提交
467
			continue;
T
Thomas Gleixner 已提交
468
		}
L
Linus Torvalds 已提交
469

470
		if (pud_val(*pud)) {
471
			if (!pud_large(*pud)) {
472
				last_map_addr = phys_pmd_update(pud, addr, end,
473
							 page_size_mask, prot);
474 475
				continue;
			}
476 477 478 479 480 481 482 483 484 485 486 487
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection  bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
488 489
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				pages++;
490
				continue;
491
			}
492
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
493 494
		}

495
		if (page_size_mask & (1<<PG_LEVEL_1G)) {
496
			pages++;
497
			spin_lock(&init_mm.page_table_lock);
498 499
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
500
			spin_unlock(&init_mm.page_table_lock);
501
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
502 503 504
			continue;
		}

505
		pmd = alloc_low_page(&pmd_phys);
506 507
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
					      prot);
508
		unmap_low_page(pmd);
509 510

		spin_lock(&init_mm.page_table_lock);
511
		pud_populate(&init_mm, pud, __va(pmd_phys));
512
		spin_unlock(&init_mm.page_table_lock);
L
Linus Torvalds 已提交
513
	}
A
Andi Kleen 已提交
514
	__flush_tlb_all();
515

516
	update_page_count(PG_LEVEL_1G, pages);
517

518
	return last_map_addr;
T
Thomas Gleixner 已提交
519
}
L
Linus Torvalds 已提交
520

521
static unsigned long __meminit
522 523
phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
		 unsigned long page_size_mask)
524 525 526 527 528
{
	pud_t *pud;

	pud = (pud_t *)pgd_page_vaddr(*pgd);

529
	return phys_pud_init(pud, addr, end, page_size_mask);
530 531
}

532
unsigned long __meminit
533 534 535
kernel_physical_mapping_init(unsigned long start,
			     unsigned long end,
			     unsigned long page_size_mask)
T
Thomas Gleixner 已提交
536
{
L
Linus Torvalds 已提交
537

538
	unsigned long next, last_map_addr = end;
L
Linus Torvalds 已提交
539 540 541 542 543

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
544
		pgd_t *pgd = pgd_offset_k(start);
T
Thomas Gleixner 已提交
545
		unsigned long pud_phys;
546 547
		pud_t *pud;

548
		next = (start + PGDIR_SIZE) & PGDIR_MASK;
549 550 551 552
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
553 554
			last_map_addr = phys_pud_update(pgd, __pa(start),
						 __pa(end), page_size_mask);
555 556 557
			continue;
		}

558
		pud = alloc_low_page(&pud_phys);
559 560
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);
561
		unmap_low_page(pud);
562 563 564 565

		spin_lock(&init_mm.page_table_lock);
		pgd_populate(&init_mm, pgd, __va(pud_phys));
		spin_unlock(&init_mm.page_table_lock);
T
Thomas Gleixner 已提交
566
	}
567
	__flush_tlb_all();
L
Linus Torvalds 已提交
568

569 570
	return last_map_addr;
}
571

572
#ifndef CONFIG_NUMA
D
David Rientjes 已提交
573 574
/*
 * Non-NUMA memory initialisation for the pfn range [@start_pfn, @end_pfn).
 *
 * With bootmem enabled: find room for the bootmem bitmap in the e820
 * map (panic if none), reserve it, initialise node 0's bootmem allocator,
 * register the active regions and release them to bootmem.
 * With CONFIG_NO_BOOTMEM: only the active regions are registered.
 *
 * @acpi and @k8 are not used by this variant.
 */
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
				int acpi, int k8)
{
#ifndef CONFIG_NO_BOOTMEM
	unsigned long bootmap_size, bootmap;

	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
				 PAGE_SIZE);
	if (bootmap == -1L)
		/* bootmap_size is unsigned long, so print it with %lu. */
		panic("Cannot find bootmem map of size %lu\n", bootmap_size);
	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
	/* don't touch min_low_pfn */
	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
					 0, end_pfn);
	e820_register_active_regions(0, start_pfn, end_pfn);
	free_bootmem_with_active_regions(0, end_pfn);
#else
	e820_register_active_regions(0, start_pfn, end_pfn);
#endif
}
594
#endif
595

L
Linus Torvalds 已提交
596 597
/*
 * Set the per-zone pfn limits (DMA, DMA32, NORMAL = max_pfn), initialise
 * sparsemem for all active regions, and hand the limits to
 * free_area_init_nodes().
 */
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	/*
	 * clear the default setting with node 0
	 * note: don't use nodes_clear here, that is really clearing when
	 *	 numa support is not compiled in, and later node_set_state
	 *	 will not set it back.
	 */
	node_clear_state(0, N_NORMAL_MEMORY);

	free_area_init_nodes(max_zone_pfns);
}

619 620 621
/*
 * Memory hotplug specific functions
 */
622
#ifdef CONFIG_MEMORY_HOTPLUG
623 624 625 626 627 628 629 630 631 632 633 634 635 636 637
/*
 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
 * updating.
 *
 * Nothing changes unless the added range [start, start + size) ends above
 * the current max_pfn.
 */
static void  update_end_of_memory_vars(u64 start, u64 size)
{
	unsigned long new_end_pfn = PFN_UP(start + size);

	if (new_end_pfn <= max_pfn)
		return;

	max_pfn = new_end_pfn;
	max_low_pfn = new_end_pfn;
	high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
}

638 639 640 641
/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
642
int arch_add_memory(int nid, u64 start, u64 size)
643
{
644
	struct pglist_data *pgdat = NODE_DATA(nid);
645
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
646
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
647 648 649
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

650
	last_mapped_pfn = init_memory_mapping(start, start + size);
651 652
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;
653

654
	ret = __add_pages(nid, zone, start_pfn, nr_pages);
655
	WARN_ON_ONCE(ret);
656

657 658 659
	/* update max_pfn, max_low_pfn and high_memory */
	update_end_of_memory_vars(start, size);

660 661
	return ret;
}
662
EXPORT_SYMBOL_GPL(arch_add_memory);
663

664
#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
665 666 667 668
/*
 * Fallback used when NUMA is enabled without ACPI NUMA support: every
 * physical address is reported as belonging to node 0.
 */
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
670 671
#endif

672 673
#endif /* CONFIG_MEMORY_HOTPLUG */

674
/* /proc/kcore entry for the vsyscall area; registered in mem_init(). */
static struct kcore_list kcore_vsyscall;
L
Linus Torvalds 已提交
675 676 677

/*
 * Final boot-time memory setup: release all bootmem pages to the page
 * allocator, mark bootmem as finished (after_bootmem = 1), register the
 * vsyscall area with /proc/kcore and print the memory summary line.
 */
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;
	unsigned long absent_pages;

	pci_iommu_alloc();

	/* clear_bss() already clear the empty_zero_page */

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif

	/* Reserved = everything below max_pfn that is neither free nor absent. */
	absent_pages = absent_pages_in_range(0, max_pfn);
	reservedpages = max_pfn - totalram_pages - absent_pages;
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
			 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
			 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		absent_pages << (PAGE_SHIFT-10),
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

717
#ifdef CONFIG_DEBUG_RODATA
718 719
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

/*
 * Set to 1 by mark_rodata_ro(); until then set_kernel_text_rw() and
 * set_kernel_text_ro() do nothing.
 */
int kernel_set_to_readonly;
722 723 724

void set_kernel_text_rw(void)
{
725
	unsigned long start = PFN_ALIGN(_text);
726
	unsigned long end = PFN_ALIGN(__stop___ex_table);
727 728 729 730 731 732 733

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read write\n",
		 start, end);

734 735 736 737 738
	/*
	 * Make the kernel identity mapping for text RW. Kernel text
	 * mapping will always be RO. Refer to the comment in
	 * static_protections() in pageattr.c
	 */
739 740 741 742 743
	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}

void set_kernel_text_ro(void)
{
744
	unsigned long start = PFN_ALIGN(_text);
745
	unsigned long end = PFN_ALIGN(__stop___ex_table);
746 747 748 749 750 751 752

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read only\n",
		 start, end);

753 754 755
	/*
	 * Set the kernel identity mapping for text RO.
	 */
756 757 758
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
}

759 760
void mark_rodata_ro(void)
{
761
	unsigned long start = PFN_ALIGN(_text);
762 763
	unsigned long rodata_start =
		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
764 765 766 767
	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
	unsigned long data_start = (unsigned long) &_sdata;
768

769
	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
770
	       (end - start) >> 10);
771 772
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

773 774
	kernel_set_to_readonly = 1;

775 776 777 778
	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
779
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
780

781 782
	rodata_test();

783
#ifdef CONFIG_CPA_DEBUG
784
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
785
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);
786

787
	printk(KERN_INFO "Testing CPA: again\n");
788
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
789
#endif
790 791 792 793 794 795 796 797

	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(text_end)),
			(unsigned long)
				 page_address(virt_to_page(rodata_start)));
	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(rodata_end)),
			(unsigned long) page_address(virt_to_page(data_start)));
798
}
799

800 801
#endif

802 803
/*
 * Reserve @len bytes of bootmem starting at physical address @phys.
 *
 * @flags is passed through to reserve_bootmem()/reserve_bootmem_node().
 * On NUMA the node-local variant is used when the whole range lies on
 * one node.
 *
 * Returns 0 on success, -EFAULT if @phys is at or above max_pfn, or the
 * error reported by the bootmem allocator. The allocator's result is
 * checked on both the NUMA and the non-NUMA path, so a failed
 * reservation is never reported as success and is never counted in
 * dma_reserve.
 */
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
				   int flags)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
#endif
	int ret;
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= max_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return -EFAULT;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
				phys, len);
		return -EFAULT;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	if (nid == next_nid)
		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
	else
		ret = reserve_bootmem(phys, len, flags);
#else
	ret = reserve_bootmem(phys, len, flags);
#endif

	if (ret != 0)
		return ret;

	/* Only ranges that end at or below the DMA limit count as DMA reserve. */
	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}

	return 0;
}

T
Thomas Gleixner 已提交
848 849
int kern_addr_valid(unsigned long addr)
{
L
Linus Torvalds 已提交
850
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
T
Thomas Gleixner 已提交
851 852 853 854
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
L
Linus Torvalds 已提交
855 856

	if (above != 0 && above != -1UL)
T
Thomas Gleixner 已提交
857 858
		return 0;

L
Linus Torvalds 已提交
859 860 861 862 863 864
	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
T
Thomas Gleixner 已提交
865
		return 0;
L
Linus Torvalds 已提交
866 867 868 869

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;
T
Thomas Gleixner 已提交
870

L
Linus Torvalds 已提交
871 872 873 874 875 876
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;
T
Thomas Gleixner 已提交
877

L
Linus Torvalds 已提交
878 879 880
	return pfn_valid(pte_pfn(*pte));
}

T
Thomas Gleixner 已提交
881 882 883 884 885
/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
L
Linus Torvalds 已提交
886
static struct vm_area_struct gate_vma = {
T
Thomas Gleixner 已提交
887 888 889 890
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
L
Linus Torvalds 已提交
891 892 893 894 895
};

/*
 * Return the gate (vsyscall) pseudo VMA for @tsk.
 * With CONFIG_IA32_EMULATION, tasks flagged TIF_IA32 get NULL instead.
 */
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);
T
Thomas Gleixner 已提交
905

906 907
	if (!vma)
		return 0;
T
Thomas Gleixner 已提交
908

L
Linus Torvalds 已提交
909 910 911
	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

T
Thomas Gleixner 已提交
912 913 914 915
/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
L
Linus Torvalds 已提交
916 917 918
 */
int in_gate_area_no_task(unsigned long addr)
{
919
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
L
Linus Torvalds 已提交
920
}
921

922 923 924 925 926 927 928 929
/*
 * Return the architecture-specific name for @vma:
 *   "[vdso]"     if the VMA starts at its mm's context.vdso address,
 *   "[vsyscall]" if it is the gate pseudo VMA,
 *   NULL         otherwise.
 */
const char *arch_vma_name(struct vm_area_struct *vma)
{
	const char *name = NULL;

	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		name = "[vdso]";
	else if (vma == &gate_vma)
		name = "[vsyscall]";

	return name;
}
930 931 932 933 934

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
935 936 937 938
/*
 * Bookkeeping shared by vmemmap_populate() and
 * vmemmap_populate_print_last(): the run of vmemmap mappings currently
 * being accumulated so it can be reported in a single debug line.
 */
static long __meminitdata addr_start, addr_end;	/* virtual range of the run */
static void __meminitdata *p_start, *p_end;	/* backing memory of the run */
static int __meminitdata node_start;		/* node the run started on */

T
Thomas Gleixner 已提交
939 940
/*
 * Populate the vmemmap range that holds @size struct pages starting at
 * @start_page, allocating on @node.
 *
 * Without cpu_has_pse the range is populated one PAGE_SIZE step at a
 * time through vmemmap_pmd_populate()/vmemmap_pte_populate(). Otherwise
 * each empty PMD slot gets a PMD_SIZE block from
 * vmemmap_alloc_block_buf() mapped with PAGE_KERNEL_LARGE, and already
 * populated slots are only verified.
 *
 * Contiguous PMD-mapped runs are tracked in addr_start/addr_end,
 * p_start/p_end and node_start; when a run breaks, the previous run is
 * printed at KERN_DEBUG.
 *
 * Returns 0 on success or -ENOMEM if any level could not be allocated.
 */
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block_buf(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	return 0;
}
1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018

/*
 * Print the vmemmap run that vmemmap_populate() is still tracking, if
 * any, and reset the run's start pointer, end pointer and node.
 */
void __meminit vmemmap_populate_print_last(void)
{
	if (!p_start)
		return;

	printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
		addr_start, addr_end-1, p_start, p_end-1, node_start);

	p_start = NULL;
	p_end = NULL;
	node_start = 0;
}
1019
#endif