/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>
#include <linux/gfp.h>

#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>
#include <asm/uv/uv.h>
#include <asm/setup.h>

static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);

int force_personality32;

/*
 * noexec32=on|off
 * Control non-executable heap for 32-bit processes.
 * To control the stack too, use noexec=off.
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	else if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	return 1;
}
__setup("noexec32=", nonx32_setup);

/*
 * When memory is added or removed, make sure all the processes' MMs have
 * suitable PGD entries in the local PGD level page.
 */
void sync_global_pgds(unsigned long start, unsigned long end)
{
	unsigned long address;

	for (address = start; address <= end; address += PGDIR_SIZE) {
		const pgd_t *pgd_ref = pgd_offset_k(address);
		struct page *page;

		if (pgd_none(*pgd_ref))
			continue;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			spinlock_t *pgt_lock;

			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);
			else
				BUG_ON(pgd_page_vaddr(*pgd)
				       != pgd_page_vaddr(*pgd_ref));

			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

/*
 * NOTE: This function is marked __ref because it calls __init function
 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

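/*
 * fill_pud()/fill_pmd()/fill_pte(): if the entry one level up is empty,
 * allocate a new page table via spp_getpage() and hook it in, then
 * return a pointer to the entry for vaddr at this level.
 */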
static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
{
	if (pgd_none(*pgd)) {
		pud_t *pud = (pud_t *)spp_getpage();
		pgd_populate(&init_mm, pgd, pud);
		if (pud != pud_offset(pgd, 0))
			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
			       pud, pud_offset(pgd, 0));
	}
	return pud_offset(pgd, vaddr);
}

static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
	if (pud_none(*pud)) {
		pmd_t *pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0))
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
			       pmd, pmd_offset(pud, 0));
	}
	return pmd_offset(pud, vaddr);
}

static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0))
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
	}
	return pte_offset_kernel(pmd, vaddr);
}

void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	pmd = fill_pmd(pud, vaddr);
	pte = fill_pte(pmd, vaddr);

	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}

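/*
 * populate_extra_pmd()/populate_extra_pte(): make sure the kernel page
 * tables for vaddr exist down to the PMD/PTE level, allocating any
 * missing intermediate levels, and return a pointer to the entry.
 */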
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = pgd_offset_k(vaddr);
	pud = fill_pud(pgd, vaddr);
	return fill_pmd(pud, vaddr);
}

pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd;

	pmd = populate_extra_pmd(vaddr);
	return fill_pte(pmd, vaddr);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			pud = (pud_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(pgd, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _brk_end.  _brk_end
 * is rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;

	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

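/*
 * Allocate a zeroed page for early page tables. Before after_bootmem is
 * set, pages come from the pre-reserved pgt_buf range and are accessed
 * through a temporary early_memremap() mapping; afterwards they come
 * from the page allocator.
 */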
static __ref void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = pgt_buf_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= pgt_buf_top)
		panic("alloc_low_page: ran out of memory");

	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
	clear_page(adr);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

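/*
 * map_low_page()/unmap_low_page(): while we are still bootstrapping, an
 * existing page-table page is accessed through a temporary
 * early_memremap() mapping; once after_bootmem is set both helpers are
 * effectively no-ops and the kernel direct mapping is used instead.
 */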
static __ref void *map_low_page(void *virt)
{
	void *adr;
	unsigned long phys, left;

	if (after_bootmem)
		return virt;

	phys = __pa(virt);
	left = phys & (PAGE_SIZE - 1);
	adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
	adr = (void *)(((unsigned long)adr) | left);

	return adr;
}

static __ref void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
}

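/*
 * phys_pte_init(), phys_pmd_init() and phys_pud_init() populate one
 * page-table level of the direct mapping for the physical range
 * [addr, end), re-using entries that are already present, and return
 * the last address that was mapped.
 */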
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
	      pgprot_t prot)
{
	unsigned pages = 0;
	unsigned long last_map_addr = end;
	int i;

	pte_t *pte = pte_page + pte_index(addr);

	for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			if (!after_bootmem) {
				for(; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume that whoever pre-set up
		 * these mappings knows what they are doing.
		 */
		if (pte_val(*pte)) {
			pages++;
			continue;
		}

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
	      unsigned long page_size_mask, pgprot_t prot)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long pte_phys;
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;
		pgprot_t new_prot = prot;

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd)) {
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
				last_map_addr = phys_pte_init(pte, address,
								end, prot);
				unmap_low_page(pte);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				pages++;
				continue;
			}
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte(address >> PAGE_SHIFT,
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
			continue;
		}

		pte = alloc_low_page(&pte_phys);
		last_map_addr = phys_pte_init(pte, address, end, new_prot);
		unmap_low_page(pte);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return last_map_addr;
}

static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;
		pgprot_t prot = PAGE_KERNEL;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud)) {
				pmd = map_low_page(pmd_offset(pud, 0));
				last_map_addr = phys_pmd_init(pmd, addr, end,
							 page_size_mask, prot);
				unmap_low_page(pmd);
				__flush_tlb_all();
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				pages++;
				continue;
			}
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
					      prot);
		unmap_low_page(pmd);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}

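/*
 * Create the kernel direct mapping for the physical range [start, end),
 * using the largest page sizes permitted by page_size_mask, and
 * propagate any newly created PGD entries to all page tables via
 * sync_global_pgds().
 */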
unsigned long __meminit
kernel_physical_mapping_init(unsigned long start,
			     unsigned long end,
			     unsigned long page_size_mask)
{
	bool pgd_changed = false;
	unsigned long next, last_map_addr = end;
	unsigned long addr;

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);
	addr = start;

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		next = (start + PGDIR_SIZE) & PGDIR_MASK;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
			last_map_addr = phys_pud_init(pud, __pa(start),
						 __pa(end), page_size_mask);
			unmap_low_page(pud);
			continue;
		}

		pud = alloc_low_page(&pud_phys);
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);
		unmap_low_page(pud);

		spin_lock(&init_mm.page_table_lock);
		pgd_populate(&init_mm, pgd, __va(pud_phys));
		spin_unlock(&init_mm.page_table_lock);
		pgd_changed = true;
	}

	if (pgd_changed)
		sync_global_pgds(addr, end);

	__flush_tlb_all();

	return last_map_addr;
}

#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
	memblock_x86_register_active_regions(0, 0, max_pfn);
}
#endif

static void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
#ifdef CONFIG_ZONE_DMA
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
#endif
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
#endif
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	free_area_init_nodes(max_zone_pfns);
}

void __init paging_init(void)
{
	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	/*
	 * clear the default setting with node 0
	 * note: don't use nodes_clear here, that is really clearing when
	 *	 numa support is not compiled in, and later node_set_state
	 *	 will not set it back.
	 */
	node_clear_state(0, N_NORMAL_MEMORY);

	zone_sizes_init();
}

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
 * updating.
 */
static void update_end_of_memory_vars(u64 start, u64 size)
{
	unsigned long end_pfn = PFN_UP(start + size);

	if (end_pfn > max_pfn) {
		max_pfn = end_pfn;
		max_low_pfn = end_pfn;
		high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
	}
}

/*
 * Memory is always added to the NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(nid, zone, start_pfn, nr_pages);
	WARN_ON_ONCE(ret);

	/* update max_pfn, max_low_pfn and high_memory */
	update_end_of_memory_vars(start, size);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#endif /* CONFIG_MEMORY_HOTPLUG */

static struct kcore_list kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;
	unsigned long absent_pages;

	pci_iommu_alloc();

	/* clear_bss() already cleared the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif

	absent_pages = absent_pages_in_range(0, max_pfn);
	reservedpages = max_pfn - totalram_pages - absent_pages;
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
			 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
			 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		absent_pages << (PAGE_SHIFT-10),
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

int kernel_set_to_readonly;

void set_kernel_text_rw(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__stop___ex_table);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read write\n",
		 start, end);

	/*
	 * Make the kernel identity mapping for text RW. Kernel text
	 * mapping will always be RO. Refer to the comment in
	 * static_protections() in pageattr.c
	 */
	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}

void set_kernel_text_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__stop___ex_table);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read only\n",
		 start, end);

	/*
	 * Set the kernel identity mapping for text RO.
	 */
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
}

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long rodata_start =
		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
	unsigned long data_start = (unsigned long) &_sdata;

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	kernel_set_to_readonly = 1;

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif

	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(text_end)),
			(unsigned long)
				 page_address(virt_to_page(rodata_start)));
	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(rodata_end)),
			(unsigned long) page_address(virt_to_page(data_start)));
}

#endif

int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_IA32_EMULATION
	if (!mm || mm->context.ia32_compat)
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(mm);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable mm, typically from interrupt
 * context. It is less reliable than using a task's mm and may give
 * false positives.
 */
int in_gate_area_no_mm(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_X86_UV
unsigned long memory_block_size_bytes(void)
{
	if (is_uv_system()) {
		printk(KERN_INFO "UV: memory block size 2GB\n");
		return 2UL * 1024 * 1024 * 1024;
	}
	return MIN_MEMORY_BLOCK_SIZE;
}
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block_buf(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	sync_global_pgds((unsigned long)start_page, end);
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif