/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>
#include <linux/gfp.h>

#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>
#include <asm/uv/uv.h>
#include <asm/setup.h>

static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the location of the first one and
 * move around without checking the pgd every time.
 */

pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);

int force_personality32;

/*
 * noexec32=on|off
 * Control non executable heap for 32bit processes.
 * To control the stack too use noexec=off
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	else if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	return 1;
}
__setup("noexec32=", nonx32_setup);

/*
 * When memory is added or removed, make sure all the process MMs have
 * suitable PGD entries in the local PGD-level page.
 */
void sync_global_pgds(unsigned long start, unsigned long end)
{
	unsigned long address;

	for (address = start; address <= end; address += PGDIR_SIZE) {
		const pgd_t *pgd_ref = pgd_offset_k(address);
		struct page *page;

		if (pgd_none(*pgd_ref))
			continue;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			spinlock_t *pgt_lock;

			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			/* the pgt_lock is only needed for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);
			else
				BUG_ON(pgd_page_vaddr(*pgd)
				       != pgd_page_vaddr(*pgd_ref));

			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

/*
 * NOTE: This function is marked __ref because it calls the __init function
 * (alloc_bootmem_pages). It's safe to do so ONLY while after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
{
	if (pgd_none(*pgd)) {
		pud_t *pud = (pud_t *)spp_getpage();
		pgd_populate(&init_mm, pgd, pud);
		if (pud != pud_offset(pgd, 0))
			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
			       pud, pud_offset(pgd, 0));
	}
	return pud_offset(pgd, vaddr);
}

static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
	if (pud_none(*pud)) {
		pmd_t *pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0))
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
			       pmd, pmd_offset(pud, 0));
	}
	return pmd_offset(pud, vaddr);
}

static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0))
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
	}
	return pte_offset_kernel(pmd, vaddr);
}

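/*
 * Install a kernel pte at @vaddr within the given pud page, creating any
 * missing pmd/pte levels via spp_getpage(), then flush the single TLB
 * entry for that address.
 */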
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	pmd = fill_pmd(pud, vaddr);
	pte = fill_pte(pmd, vaddr);

	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}

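/*
 * Make sure the pud and pmd levels for @vaddr exist in the kernel page
 * tables and return a pointer to the pmd entry covering it.
 */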
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = pgd_offset_k(vaddr);
	pud = fill_pud(pgd, vaddr);
	return fill_pmd(pud, vaddr);
}

pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd;

	pmd = populate_extra_pmd(vaddr);
	return fill_pte(pmd, vaddr);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			pud = (pud_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(pgd, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _brk_end.  _brk_end
 * is rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;

	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

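/*
 * Allocate one zeroed page for building the early direct mapping.  While
 * after_bootmem is still 0 the page is taken from the pgt_buf area and
 * accessed through early_memremap(); later a normal GFP_ATOMIC page is
 * used.  The physical address is returned through @phys.
 */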
static __ref void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = pgt_buf_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= pgt_buf_top)
		panic("alloc_low_page: ran out of memory");

	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
	clear_page(adr);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

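/*
 * map_low_page()/unmap_low_page() give temporary access to a page-table
 * page: before after_bootmem the page is mapped via early_memremap(),
 * afterwards the existing virtual address is used directly.
 */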
static __ref void *map_low_page(void *virt)
{
	void *adr;
	unsigned long phys, left;

	if (after_bootmem)
		return virt;

	phys = __pa(virt);
	left = phys & (PAGE_SIZE - 1);
	adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
	adr = (void *)(((unsigned long)adr) | left);

	return adr;
}

static __ref void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
}

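/*
 * Create 4k pte mappings for the physical range [addr, end) within one
 * pte page, leaving already-populated entries untouched.  Returns the
 * end of the last range that was mapped.
 */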
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
	      pgprot_t prot)
{
	unsigned pages = 0;
	unsigned long last_map_addr = end;
	int i;

	pte_t *pte = pte_page + pte_index(addr);

	for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			if (!after_bootmem) {
				for(; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume whoever pre-set up
		 * these mappings knows what they are doing.
		 */
		if (pte_val(*pte)) {
			pages++;
			continue;
		}

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return last_map_addr;
}

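/*
 * Create 2MB mappings for the physical range [address, end) within one
 * pmd page, falling back to phys_pte_init() when 2MB pages are not
 * allowed by page_size_mask.  Returns the end of the last range mapped.
 */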
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
	      unsigned long page_size_mask, pgprot_t prot)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long pte_phys;
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;
		pgprot_t new_prot = prot;

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd)) {
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
				last_map_addr = phys_pte_init(pte, address,
								end, prot);
				unmap_low_page(pte);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping,
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				pages++;
				continue;
			}
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte(address >> PAGE_SHIFT,
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
			continue;
		}

		pte = alloc_low_page(&pte_phys);
		last_map_addr = phys_pte_init(pte, address, end, new_prot);
		unmap_low_page(pte);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return last_map_addr;
}

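/*
 * Create 1GB mappings for the physical range [addr, end) within one pud
 * page, falling back to phys_pmd_init() when 1GB pages are not allowed
 * by page_size_mask.  Returns the end of the last range mapped.
 */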
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;
		pgprot_t prot = PAGE_KERNEL;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud)) {
				pmd = map_low_page(pmd_offset(pud, 0));
				last_map_addr = phys_pmd_init(pmd, addr, end,
							 page_size_mask, prot);
				unmap_low_page(pmd);
				__flush_tlb_all();
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				pages++;
				continue;
			}
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
					      prot);
		unmap_low_page(pmd);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}

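/*
 * Build the kernel direct mapping for the physical range [start, end),
 * walking the pgd level and delegating to phys_pud_init().  Newly
 * created pgd entries are propagated to all mms via sync_global_pgds().
 */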
unsigned long __meminit
kernel_physical_mapping_init(unsigned long start,
			     unsigned long end,
			     unsigned long page_size_mask)
{
	bool pgd_changed = false;
	unsigned long next, last_map_addr = end;
	unsigned long addr;

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);
	addr = start;

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		next = (start + PGDIR_SIZE) & PGDIR_MASK;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
			last_map_addr = phys_pud_init(pud, __pa(start),
						 __pa(end), page_size_mask);
			unmap_low_page(pud);
			continue;
		}

		pud = alloc_low_page(&pud_phys);
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);
		unmap_low_page(pud);

		spin_lock(&init_mm.page_table_lock);
		pgd_populate(&init_mm, pgd, __va(pud_phys));
		spin_unlock(&init_mm.page_table_lock);
		pgd_changed = true;
	}

	if (pgd_changed)
		sync_global_pgds(addr, end);

	__flush_tlb_all();

	return last_map_addr;
}

#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
	memblock_x86_register_active_regions(0, 0, max_pfn);
}
#endif

static void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
#ifdef CONFIG_ZONE_DMA
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
#endif
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	free_area_init_nodes(max_zone_pfns);
}

void __init paging_init(void)
{
	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	/*
	 * Clear the default state for node 0.
	 * Note: don't use nodes_clear here; that really clears the state
	 *	 when NUMA support is not compiled in, and a later
	 *	 node_set_state will not set it back.
	 */
	node_clear_state(0, N_NORMAL_MEMORY);

	zone_sizes_init();
}

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
 * updating.
 */
static void  update_end_of_memory_vars(u64 start, u64 size)
{
	unsigned long end_pfn = PFN_UP(start + size);

	if (end_pfn > max_pfn) {
		max_pfn = end_pfn;
		max_low_pfn = end_pfn;
		high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
	}
}

/*
 * Memory is always added to the NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(nid, zone, start_pfn, nr_pages);
	WARN_ON_ONCE(ret);

	/* update max_pfn, max_low_pfn and high_memory */
	update_end_of_memory_vars(start, size);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#endif /* CONFIG_MEMORY_HOTPLUG */

static struct kcore_list kcore_vsyscall;

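/*
 * Release all bootmem pages to the page allocator, register the vsyscall
 * range with /proc/kcore and print the memory layout summary.
 */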
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;
	unsigned long absent_pages;

	pci_iommu_alloc();

	/* clear_bss() has already cleared the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif

	absent_pages = absent_pages_in_range(0, max_pfn);
	reservedpages = max_pfn - totalram_pages - absent_pages;
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
			 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
			 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		absent_pages << (PAGE_SHIFT-10),
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

int kernel_set_to_readonly;

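/*
 * set_kernel_text_rw()/set_kernel_text_ro() toggle the kernel identity
 * mapping of the text range (_text .. __stop___ex_table) between
 * read-write and read-only once kernel_set_to_readonly has been set by
 * mark_rodata_ro().
 */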
void set_kernel_text_rw(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__stop___ex_table);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read write\n",
		 start, end);

	/*
	 * Make the kernel identity mapping for text RW. Kernel text
	 * mapping will always be RO. Refer to the comment in
	 * static_protections() in pageattr.c
	 */
	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}

void set_kernel_text_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__stop___ex_table);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read only\n",
		 start, end);

	/*
	 * Set the kernel identity mapping for text RO.
	 */
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
}

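/*
 * Write-protect the kernel text and rodata, mark rodata non-executable
 * and free the unused pages between the section boundaries.
 */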
void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long rodata_start =
		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
	unsigned long data_start = (unsigned long) &_sdata;

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	kernel_set_to_readonly = 1;

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif

	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(text_end)),
			(unsigned long)
				 page_address(virt_to_page(rodata_start)));
	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(rodata_end)),
			(unsigned long) page_address(virt_to_page(data_start)));
816
}
817

818 819
#endif

T
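/*
 * kern_addr_valid() reports whether a kernel virtual address is backed
 * by a present mapping in the init page tables.
 */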
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_IA32_EMULATION
	if (!mm || mm->context.ia32_compat)
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(mm);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable mm, typically from interrupt
 * context. It is less reliable than using a task's mm and may give
 * false positives.
 */
int in_gate_area_no_mm(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_X86_UV
unsigned long memory_block_size_bytes(void)
{
	if (is_uv_system()) {
		printk(KERN_INFO "UV: memory block size 2GB\n");
		return 2UL * 1024 * 1024 * 1024;
	}
	return MIN_MEMORY_BLOCK_SIZE;
}
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block_buf(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	sync_global_pgds((unsigned long)start_page, end);
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif