/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>
#include <linux/gfp.h>

#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>

static unsigned long dma_reserve __initdata;

static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the location of the first one and
 * move around without checking the pgd every time.
 */

pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);

int force_personality32;

/*
 * noexec32=on|off
 * Control the non-executable heap for 32-bit processes.
 * To control the stack too, use noexec=off.
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	else if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	return 1;
}
__setup("noexec32=", nonx32_setup);

/*
 * When memory is added or removed, make sure that all processes' MMs have
 * suitable PGD entries in the local PGD level page.
 */
void sync_global_pgds(unsigned long start, unsigned long end)
{
	unsigned long address;

	for (address = start; address <= end; address += PGDIR_SIZE) {
		const pgd_t *pgd_ref = pgd_offset_k(address);
		unsigned long flags;
		struct page *page;

		if (pgd_none(*pgd_ref))
			continue;

		spin_lock_irqsave(&pgd_lock, flags);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);
			else
				BUG_ON(pgd_page_vaddr(*pgd)
				       != pgd_page_vaddr(*pgd_ref));
		}
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
}

/*
 * NOTE: This function is marked __ref because it calls the __init function
 * alloc_bootmem_pages(). It is safe to do so ONLY when after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

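/*
 * The fill_*() helpers below walk one level of the kernel page tables for
 * @vaddr, allocating a new lower-level table via spp_getpage() and hooking
 * it up when the entry is still empty.
 */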
static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
{
	if (pgd_none(*pgd)) {
		pud_t *pud = (pud_t *)spp_getpage();
		pgd_populate(&init_mm, pgd, pud);
		if (pud != pud_offset(pgd, 0))
			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
			       pud, pud_offset(pgd, 0));
	}
	return pud_offset(pgd, vaddr);
}

static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
	if (pud_none(*pud)) {
		pmd_t *pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0))
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
			       pmd, pmd_offset(pud, 0));
	}
	return pmd_offset(pud, vaddr);
}

static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0))
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
	}
	return pte_offset_kernel(pmd, vaddr);
}

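/*
 * Install a single kernel pte for @vaddr below the given pud page,
 * populating the intermediate pmd/pte tables as needed, and flush the one
 * affected TLB entry.
 */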
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	pmd = fill_pmd(pud, vaddr);
	pte = fill_pte(pmd, vaddr);

	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}

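/*
 * Early-boot helpers: make sure all page-table levels for @vaddr exist and
 * return the pmd (or pte) entry that covers it.
 */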
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = pgd_offset_k(vaddr);
	pud = fill_pud(pgd, vaddr);
	return fill_pmd(pud, vaddr);
}

pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd;

	pmd = populate_extra_pmd(vaddr);
	return fill_pte(pmd, vaddr);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			pud = (pud_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(pgd, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

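/*
 * Allocate one zeroed page for early page tables.  After bootmem is up this
 * comes from the page allocator; before that it is taken from the e820
 * range reserved for page tables and mapped with early_memremap().  The
 * physical address is returned through @phys.
 */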
static __ref void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = e820_table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= e820_table_top)
		panic("alloc_low_page: ran out of memory");

	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

static __ref void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

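/*
 * Fill one pte page with 4k direct mappings for the physical range
 * [addr, end) using protection @prot.  Already-present entries are left
 * alone.  Returns the last physical address that was mapped.
 */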
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
	      pgprot_t prot)
{
	unsigned pages = 0;
	unsigned long last_map_addr = end;
	int i;

	pte_t *pte = pte_page + pte_index(addr);

	for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			if (!after_bootmem) {
				for(; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume that whoever pre-set up
		 * these mappings knew what they were doing.
		 */
		if (pte_val(*pte)) {
			pages++;
			continue;
		}

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
		pgprot_t prot)
{
	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);

	return phys_pte_init(pte, address, end, prot);
}

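/*
 * Same as phys_pte_init(), one level up: map [address, end) at the pmd
 * level, using 2M pages when page_size_mask allows it and falling back to
 * a pte page otherwise.
 */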
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
	      unsigned long page_size_mask, pgprot_t prot)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long pte_phys;
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;
		pgprot_t new_prot = prot;

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd)) {
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				last_map_addr = phys_pte_update(pmd, address,
								end, prot);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping,
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				pages++;
				continue;
			}
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte(address >> PAGE_SHIFT,
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
			continue;
		}

		pte = alloc_low_page(&pte_phys);
		last_map_addr = phys_pte_init(pte, address, end, new_prot);
		unmap_low_page(pte);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return last_map_addr;
}

static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
		unsigned long page_size_mask, pgprot_t prot)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long last_map_addr;

	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
	__flush_tlb_all();
	return last_map_addr;
}

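/*
 * Map the physical range [addr, end) at the pud level, using 1G pages when
 * page_size_mask allows it and descending to phys_pmd_init() otherwise.
 */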
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;
		pgprot_t prot = PAGE_KERNEL;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud)) {
				last_map_addr = phys_pmd_update(pud, addr, end,
							 page_size_mask, prot);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				pages++;
				continue;
			}
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
					      prot);
		unmap_low_page(pmd);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
		 unsigned long page_size_mask)
{
	pud_t *pud;

	pud = (pud_t *)pgd_page_vaddr(*pgd);

	return phys_pud_init(pud, addr, end, page_size_mask);
}

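/*
 * Build the kernel direct mapping for the physical range [start, end),
 * using the largest page sizes permitted by page_size_mask, and propagate
 * any newly created pgd entries to all page tables.
 */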
unsigned long __meminit
kernel_physical_mapping_init(unsigned long start,
			     unsigned long end,
			     unsigned long page_size_mask)
{
	bool pgd_changed = false;
	unsigned long next, last_map_addr = end;
	unsigned long addr;

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);
	addr = start;

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		next = (start + PGDIR_SIZE) & PGDIR_MASK;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			last_map_addr = phys_pud_update(pgd, __pa(start),
						 __pa(end), page_size_mask);
			continue;
		}

		pud = alloc_low_page(&pud_phys);
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);
		unmap_low_page(pud);

		spin_lock(&init_mm.page_table_lock);
		pgd_populate(&init_mm, pgd, __va(pud_phys));
		spin_unlock(&init_mm.page_table_lock);
		pgd_changed = true;
	}

	if (pgd_changed)
		sync_global_pgds(addr, end);

	__flush_tlb_all();

	return last_map_addr;
}

#ifndef CONFIG_NUMA
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
				int acpi, int k8)
{
#ifndef CONFIG_NO_BOOTMEM
	unsigned long bootmap_size, bootmap;

	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
				 PAGE_SIZE);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
	/* don't touch min_low_pfn */
	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
					 0, end_pfn);
	e820_register_active_regions(0, start_pfn, end_pfn);
	free_bootmem_with_active_regions(0, end_pfn);
#else
	e820_register_active_regions(0, start_pfn, end_pfn);
#endif
}
#endif

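/*
 * Set up the zone limits (DMA, DMA32, NORMAL) and the sparse memory model,
 * then hand all page ranges over to the core allocator.
 */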
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	/*
	 * clear the default setting with node 0
	 * note: don't use nodes_clear here, that is really clearing when
	 *	 numa support is not compiled in, and later node_set_state
	 *	 will not set it back.
	 */
	node_clear_state(0, N_NORMAL_MEMORY);

	free_area_init_nodes(max_zone_pfns);
}

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
 * updating.
 */
static void  update_end_of_memory_vars(u64 start, u64 size)
{
	unsigned long end_pfn = PFN_UP(start + size);

	if (end_pfn > max_pfn) {
		max_pfn = end_pfn;
		max_low_pfn = end_pfn;
		high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
	}
}

/*
 * Memory is always added to the NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(nid, zone, start_pfn, nr_pages);
	WARN_ON_ONCE(ret);

	/* update max_pfn, max_low_pfn and high_memory */
	update_end_of_memory_vars(start, size);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

static struct kcore_list kcore_vsyscall;

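/*
 * Release boot-time memory to the buddy allocator and print the final
 * memory statistics.
 */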
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;
	unsigned long absent_pages;

	pci_iommu_alloc();

	/* clear_bss() already cleared the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif

	absent_pages = absent_pages_in_range(0, max_pfn);
	reservedpages = max_pfn - totalram_pages - absent_pages;
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
			 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
			 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		absent_pages << (PAGE_SHIFT-10),
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

int kernel_set_to_readonly;

void set_kernel_text_rw(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__stop___ex_table);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read write\n",
		 start, end);

	/*
	 * Make the kernel identity mapping for text RW. Kernel text
	 * mapping will always be RO. Refer to the comment in
	 * static_protections() in pageattr.c
	 */
	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}

void set_kernel_text_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__stop___ex_table);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read only\n",
		 start, end);

	/*
	 * Set the kernel identity mapping for text RO.
	 */
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
}

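/*
 * Write-protect the kernel text and rodata, mark rodata non-executable and
 * free the alignment padding around the rodata section.
 */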
void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long rodata_start =
		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
	unsigned long data_start = (unsigned long) &_sdata;

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	kernel_set_to_readonly = 1;

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif

	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(text_end)),
			(unsigned long)
				 page_address(virt_to_page(rodata_start)));
	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(rodata_end)),
			(unsigned long) page_address(virt_to_page(data_start)));
}

#endif

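/*
 * Reserve a physical range with the bootmem allocator (per node on NUMA)
 * and account reservations that fall inside the DMA zone in dma_reserve.
 */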
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
				   int flags)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
	int ret;
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= max_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return -EFAULT;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
				phys, len);
		return -EFAULT;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	if (nid == next_nid)
		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
	else
		ret = reserve_bootmem(phys, len, flags);

	if (ret != 0)
		return ret;

#else
	reserve_bootmem(phys, len, flags);
#endif

	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}

	return 0;
}

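/*
 * Check whether a kernel virtual address is backed by a present mapping by
 * walking the page tables; 2M mappings at the pmd level are handled too.
 */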
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block_buf(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	sync_global_pgds((unsigned long)start_page, end);
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif