init_64.c 24.1 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4
/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
P
Pavel Machek 已提交
5
 *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
L
Linus Torvalds 已提交
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
T
Thomas Gleixner 已提交
21
#include <linux/initrd.h>
L
Linus Torvalds 已提交
22 23
#include <linux/pagemap.h>
#include <linux/bootmem.h>
24
#include <linux/memblock.h>
L
Linus Torvalds 已提交
25
#include <linux/proc_fs.h>
26
#include <linux/pci.h>
27
#include <linux/pfn.h>
28
#include <linux/poison.h>
29
#include <linux/dma-mapping.h>
30 31
#include <linux/module.h>
#include <linux/memory_hotplug.h>
32
#include <linux/nmi.h>
33
#include <linux/gfp.h>
L
Linus Torvalds 已提交
34 35

#include <asm/processor.h>
36
#include <asm/bios_ebda.h>
L
Linus Torvalds 已提交
37 38 39 40 41 42 43 44 45 46 47 48
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
49
#include <asm/sections.h>
50
#include <asm/kdebug.h>
51
#include <asm/numa.h>
52
#include <asm/cacheflush.h>
53
#include <asm/init.h>
54
#include <asm/uv/uv.h>
L
Linus Torvalds 已提交
55

/* Early cmdline "nogbpages": forbid 1GB pages in the direct mapping. */
static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

/* Early cmdline "gbpages": allow 1GB pages in the direct mapping. */
static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
 * physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

/* Mask of PTE bits this CPU supports; _PAGE_IOMAP is software-only. */
pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);

/* Non-zero forces READ_IMPLIES_EXEC personality for 32-bit tasks. */
int force_personality32;

/*
 * noexec32=on|off
 * Control non executable heap for 32bit processes.
 * To control the stack too use noexec=off
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	else if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	return 1;
}
__setup("noexec32=", nonx32_setup);

/*
 * When memory was added/removed make sure all the processes MM have
 * suitable PGD entries in the local PGD level page.
 */
void sync_global_pgds(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr <= end; addr += PGDIR_SIZE) {
		const pgd_t *ref = pgd_offset_k(addr);
		struct page *page;

		/* Nothing to propagate for this PGD slot. */
		if (pgd_none(*ref))
			continue;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			spinlock_t *pgt_lock;
			pgd_t *pgd;

			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

			if (pgd_none(*pgd))
				set_pgd(pgd, *ref);
			else
				BUG_ON(pgd_page_vaddr(*pgd)
				       != pgd_page_vaddr(*ref));

			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

/*
 * Allocate one zeroed page for a page-table level.
 *
 * NOTE: This function is marked __ref because it calls __init function
 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	/* A page-table page must exist and be page aligned. */
	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}
/* Return the PUD entry for vaddr, allocating the PUD page if absent. */
static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
{
	if (pgd_none(*pgd)) {
		pud_t *pud = (pud_t *)spp_getpage();
		pgd_populate(&init_mm, pgd, pud);
		if (pud != pud_offset(pgd, 0))
			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
			       pud, pud_offset(pgd, 0));
	}
	return pud_offset(pgd, vaddr);
}
/* Return the PMD entry for vaddr, allocating the PMD page if absent. */
static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
	if (pud_none(*pud)) {
		pmd_t *pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0))
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
			       pmd, pmd_offset(pud, 0));
	}
	return pmd_offset(pud, vaddr);
}

/* Return the PTE for vaddr, allocating the PTE page if absent. */
static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0))
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
	}
	return pte_offset_kernel(pmd, vaddr);
}

/* Install new_pte at vaddr underneath the given PUD page, filling
 * intermediate levels on demand, then flush that one mapping. */
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud = pud_page + pud_index(vaddr);
	pmd_t *pmd = fill_pmd(pud, vaddr);
	pte_t *pte = fill_pte(pmd, vaddr);

	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/* Install pteval at a kernel virtual address; the top-level PGD slot
 * must already have been set up (by head.S). */
void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}

/* Walk (and fill) down to the PMD entry covering vaddr. */
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd = pgd_offset_k(vaddr);
	pud_t *pud = fill_pud(pgd, vaddr);

	return fill_pmd(pud, vaddr);
}

/* Walk (and fill) down to the PTE covering vaddr. */
pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd = populate_extra_pmd(vaddr);

	return fill_pte(pmd, vaddr);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	/* Both range ends must be 2MB aligned: we map with PMD entries. */
	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			pud = (pud_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(pgd, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

/* Extra 2MB mapping, write-back cached. */
void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}

/* Extra 2MB mapping, uncached (for device/firmware regions). */
void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		/* Clear any mapping outside [_text, _end). */
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

/*
 * Allocate one zeroed page for early page tables and return its virtual
 * address; the physical address is stored through *phys.  Before bootmem
 * is up this carves pages out of the pgt_buf window via early_memremap().
 */
static __ref void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = pgt_buf_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= pgt_buf_top)
		panic("alloc_low_page: ran out of memory");

	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
	clear_page(adr);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}
/*
 * Get a usable virtual address for an existing page-table page.  After
 * bootmem the direct mapping works; before that we go through a
 * temporary early_memremap() fixmap slot, preserving the in-page offset.
 */
static __ref void *map_low_page(void *virt)
{
	unsigned long phys, offset;
	void *adr;

	if (after_bootmem)
		return virt;

	phys = __pa(virt);
	offset = phys & (PAGE_SIZE - 1);
	adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
	adr = (void *)(((unsigned long)adr) | offset);

	return adr;
}

/* Undo map_low_page()/alloc_low_page(); a no-op once bootmem is up. */
static __ref void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
}
/*
 * Fill a PTE page with 4k mappings for [addr, end), using protection
 * bits @prot.  Returns the highest address actually mapped.
 */
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
	      pgprot_t prot)
{
	unsigned pages = 0;
	unsigned long last_map_addr = end;
	int i;

	pte_t *pte = pte_page + pte_index(addr);

	for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			/* Zero the unused tail entries during early boot. */
			if (!after_bootmem) {
				for (; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume someone who pre-setup
		 * these mappings are more intelligent.
		 */
		if (pte_val(*pte)) {
			pages++;
			continue;
		}

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return last_map_addr;
}

/*
 * Fill a PMD page with mappings for [address, end), preferring 2MB
 * pages when page_size_mask allows, else descending to 4k PTEs.
 * Returns the highest address actually mapped.
 */
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
	      unsigned long page_size_mask, pgprot_t prot)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long pte_phys;
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;
		pgprot_t new_prot = prot;

		if (address >= end) {
			/* Zero the unused tail entries during early boot. */
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd)) {
			if (!pmd_large(*pmd)) {
				/* Existing 4k table: recurse into it. */
				spin_lock(&init_mm.page_table_lock);
				pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
				last_map_addr = phys_pte_init(pte, address,
								end, prot);
				unmap_low_page(pte);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping,
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				pages++;
				continue;
			}
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			/* Map this slot with a single 2MB page. */
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte(address >> PAGE_SHIFT,
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
			continue;
		}

		/* Fall back to a fresh 4k PTE page for this 2MB slot. */
		pte = alloc_low_page(&pte_phys);
		last_map_addr = phys_pte_init(pte, address, end, new_prot);
		unmap_low_page(pte);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return last_map_addr;
}

/*
 * Fill a PUD page with mappings for [addr, end), preferring 1GB pages
 * when page_size_mask allows, else descending to phys_pmd_init().
 * Returns the highest address actually mapped.
 */
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;
		pgprot_t prot = PAGE_KERNEL;

		if (addr >= end)
			break;

		/* Skip slots the e820 map says contain no RAM. */
		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud)) {
				/* Existing PMD table: recurse into it. */
				pmd = map_low_page(pmd_offset(pud, 0));
				last_map_addr = phys_pmd_init(pmd, addr, end,
							 page_size_mask, prot);
				unmap_low_page(pmd);
				__flush_tlb_all();
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection  bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				pages++;
				continue;
			}
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			/* Map this slot with a single 1GB page. */
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		/* Fall back to a fresh PMD page for this 1GB slot. */
		pmd = alloc_low_page(&pmd_phys);
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
					      prot);
		unmap_low_page(pmd);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}
/*
 * Build the kernel direct mapping for physical range [start, end) using
 * the largest page sizes permitted by page_size_mask.  Returns the
 * highest physical address actually mapped.
 */
unsigned long __meminit
kernel_physical_mapping_init(unsigned long start,
			     unsigned long end,
			     unsigned long page_size_mask)
{
	bool pgd_changed = false;
	unsigned long next, last_map_addr = end;
	unsigned long addr;

	/* Work in direct-mapping virtual addresses from here on. */
	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);
	addr = start;

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		next = (start + PGDIR_SIZE) & PGDIR_MASK;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			/* Existing PUD table: recurse into it. */
			pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
			last_map_addr = phys_pud_init(pud, __pa(start),
						 __pa(end), page_size_mask);
			unmap_low_page(pud);
			continue;
		}

		pud = alloc_low_page(&pud_phys);
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);
		unmap_low_page(pud);

		spin_lock(&init_mm.page_table_lock);
		pgd_populate(&init_mm, pgd, __va(pud_phys));
		spin_unlock(&init_mm.page_table_lock);
		pgd_changed = true;
	}

	/* New top-level entries must be copied to every mm's PGD. */
	if (pgd_changed)
		sync_global_pgds(addr, end);

	__flush_tlb_all();

	return last_map_addr;
}
605

#ifndef CONFIG_NUMA
/* Flat (non-NUMA) setup: all memory belongs to a single node 0. */
void __init initmem_init(void)
{
	memblock_x86_register_active_regions(0, 0, max_pfn);
}
#endif
612

/* Set up zone boundaries and the sparse memmap, then build the zones. */
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	/*
	 * clear the default setting with node 0
	 * note: don't use nodes_clear here, that is really clearing when
	 *	 numa support is not compiled in, and later node_set_state
	 *	 will not set it back.
	 */
	node_clear_state(0, N_NORMAL_MEMORY);

	free_area_init_nodes(max_zone_pfns);
}

636 637 638
/*
 * Memory hotplug specific functions
 */
639
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
 * updating.
 */
static void  update_end_of_memory_vars(u64 start, u64 size)
{
	unsigned long end_pfn = PFN_UP(start + size);

	if (end_pfn > max_pfn) {
		max_pfn = end_pfn;
		max_low_pfn = end_pfn;
		high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
	}
}

/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	/* Extend the direct mapping to cover the hot-added range. */
	last_mapped_pfn = init_memory_mapping(start, start + size);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(nid, zone, start_pfn, nr_pages);
	WARN_ON_ONCE(ret);

	/* update max_pfn, max_low_pfn and high_memory */
	update_end_of_memory_vars(start, size);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);
680

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
/* Without ACPI NUMA information, hot-added memory goes to node 0. */
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

689 690
#endif /* CONFIG_MEMORY_HOTPLUG */

691
static struct kcore_list kcore_vsyscall;
L
Linus Torvalds 已提交
692 693 694

/* Release boot memory to the buddy allocator and report the totals. */
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;
	unsigned long absent_pages;

	pci_iommu_alloc();

	/* clear_bss() already clear the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif

	absent_pages = absent_pages_in_range(0, max_pfn);
	reservedpages = max_pfn - totalram_pages - absent_pages;
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
			 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
			 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		absent_pages << (PAGE_SHIFT-10),
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

734
#ifdef CONFIG_DEBUG_RODATA
735 736
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
737

738
int kernel_set_to_readonly;
/* Make the kernel-text identity mapping writable (e.g. for patching). */
void set_kernel_text_rw(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__stop___ex_table);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read write\n",
		 start, end);

	/*
	 * Make the kernel identity mapping for text RW. Kernel text
	 * mapping will always be RO. Refer to the comment in
	 * static_protections() in pageattr.c
	 */
	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}

/* Restore the kernel-text identity mapping to read-only. */
void set_kernel_text_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__stop___ex_table);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read only\n",
		 start, end);

	/*
	 * Set the kernel identity mapping for text RO.
	 */
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
}

/* Write-protect kernel text/rodata and free the alignment gaps. */
void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long rodata_start =
		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
	unsigned long data_start = (unsigned long) &_sdata;

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	kernel_set_to_readonly = 1;

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif

	/* Free the padding between text/rodata and rodata/data. */
	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(text_end)),
			(unsigned long)
				 page_address(virt_to_page(rodata_start)));
	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(rodata_end)),
			(unsigned long) page_address(virt_to_page(data_start)));
}
816

817 818
#endif

/*
 * kern_addr_valid - test whether a kernel virtual address is backed by
 * a present mapping with a valid page frame.
 *
 * Returns non-zero when @addr is canonical and the page-table walk
 * reaches a present (huge or 4k) entry whose pfn is valid.
 */
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Non-canonical addresses are never mapped. */
	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	/*
	 * Fix: a 1GB (gbpages) direct mapping terminates the walk at the
	 * PUD level.  Without this check, pmd_offset() below would treat
	 * the huge-page frame as a page-table pointer and walk garbage.
	 */
	if (pud_large(*pud))
		return pfn_valid(pud_pfn(*pud));

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

/* Return the vsyscall gate VMA, or NULL for 32-bit-compat mms. */
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_IA32_EMULATION
	if (!mm || mm->context.ia32_compat)
		return NULL;
#endif
	return &gate_vma;
}

/* Does addr fall inside the task's vsyscall gate VMA? */
int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task->mm);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}
892

/* Name special mappings for /proc/<pid>/maps. */
const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}
901

#ifdef CONFIG_X86_UV
#define MIN_MEMORY_BLOCK_SIZE   (1 << SECTION_SIZE_BITS)

/* Hotplug block granularity: 2GB on SGI UV, one section otherwise. */
unsigned long memory_block_size_bytes(void)
{
	if (is_uv_system()) {
		printk(KERN_INFO "UV: memory block size 2GB\n");
		return 2UL * 1024 * 1024 * 1024;
	}
	return MIN_MEMORY_BLOCK_SIZE;
}
#endif

915 916 917 918
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
919 920 921 922
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

/*
 * Populate the vmemmap for [start_page, start_page + size) on @node,
 * using 2MB PMD mappings when the CPU has PSE, 4k PTEs otherwise.
 */
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			/* No large pages: back the vmemmap with 4k PTEs. */
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block_buf(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	/* Propagate any new top-level entries to all PGDs. */
	sync_global_pgds((unsigned long)start_page, end);
	return 0;
}
/* Flush the deferred "contiguous PMD range" debug line, if any. */
void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
1004
#endif