/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>
#include <linux/gfp.h>

#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>
#include <asm/uv/uv.h>
#include <asm/setup.h>

static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);

int force_personality32;

/*
 * noexec32=on|off
 * Control non executable heap for 32bit processes.
 * To control the stack too use noexec=off
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	else if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	return 1;
}
__setup("noexec32=", nonx32_setup);

/*
 * When memory was added/removed make sure all the processes MM have
 * suitable PGD entries in the local PGD level page.
 */
void sync_global_pgds(unsigned long start, unsigned long end)
{
	unsigned long address;

	for (address = start; address <= end; address += PGDIR_SIZE) {
		const pgd_t *pgd_ref = pgd_offset_k(address);
		struct page *page;

		if (pgd_none(*pgd_ref))
			continue;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			spinlock_t *pgt_lock;

			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);
			else
				BUG_ON(pgd_page_vaddr(*pgd)
				       != pgd_page_vaddr(*pgd_ref));

			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

/*
 * NOTE: This function is marked __ref because it calls __init function
 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

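/*
 * The fill_*() helpers below each walk one level of the kernel page
 * tables for vaddr and, when the next level is missing, allocate it
 * with spp_getpage() and hook it up, returning the entry for vaddr.
 */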
static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
{
	if (pgd_none(*pgd)) {
		pud_t *pud = (pud_t *)spp_getpage();
		pgd_populate(&init_mm, pgd, pud);
		if (pud != pud_offset(pgd, 0))
			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
			       pud, pud_offset(pgd, 0));
	}
	return pud_offset(pgd, vaddr);
}

static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
	if (pud_none(*pud)) {
		pmd_t *pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0))
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
			       pmd, pmd_offset(pud, 0));
	}
	return pmd_offset(pud, vaddr);
}

static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0))
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
	}
	return pte_offset_kernel(pmd, vaddr);
}

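/*
 * Install a single pte for vaddr underneath the given pud page,
 * allocating any missing intermediate levels on the way down.
 */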
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	pmd = fill_pmd(pud, vaddr);
	pte = fill_pte(pmd, vaddr);

	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}

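/*
 * Early helpers returning the pmd/pte entry that covers vaddr in the
 * kernel page tables, allocating intermediate levels as needed.
 */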
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = pgd_offset_k(vaddr);
	pud = fill_pud(pgd, vaddr);
	return fill_pmd(pud, vaddr);
}

pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd;

	pmd = populate_extra_pmd(vaddr);
	return fill_pte(pmd, vaddr);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			pud = (pud_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(pgd, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _brk_end.  _brk_end
 * is rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;

	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

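/*
 * Allocate one zeroed page for early page-table construction and return
 * its virtual address; the physical address is passed back via *phys.
 * Before bootmem is available the page comes from the pgt_buf range
 * reserved earlier during boot and is mapped with early_memremap().
 */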
static __ref void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = pgt_buf_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= pgt_buf_top)
		panic("alloc_low_page: ran out of memory");

	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
	clear_page(adr);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

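/*
 * Temporarily map a page-table page so it can be accessed while bootmem
 * is not yet up; once after_bootmem is set the virtual address can be
 * used directly.
 */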
static __ref void *map_low_page(void *virt)
{
	void *adr;
	unsigned long phys, left;

	if (after_bootmem)
		return virt;

	phys = __pa(virt);
	left = phys & (PAGE_SIZE - 1);
	adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
	adr = (void *)(((unsigned long)adr) | left);

	return adr;
}

static __ref void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
}

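/*
 * Fill one pte page with 4k mappings for the physical range [addr, end).
 * Entries that are already present are reused. Returns the last physical
 * address that was mapped.
 */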
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
	      pgprot_t prot)
{
	unsigned pages = 0;
	unsigned long last_map_addr = end;
	int i;

	pte_t *pte = pte_page + pte_index(addr);

	for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			if (!after_bootmem) {
				for(; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume someone who pre-setup
		 * these mappings are more intelligent.
		 */
		if (pte_val(*pte)) {
			pages++;
			continue;
		}

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return last_map_addr;
}

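/*
 * Set up the pmd level for the physical range [address, end): use 2M
 * pages where page_size_mask allows it, otherwise fill a pte page via
 * phys_pte_init(). Existing mappings are reused where possible.
 */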
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
	      unsigned long page_size_mask, pgprot_t prot)
{
	unsigned long pages = 0, next;
	unsigned long last_map_addr = end;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address = next) {
		unsigned long pte_phys;
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;
		pgprot_t new_prot = prot;

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		next = (address & PMD_MASK) + PMD_SIZE;

		if (pmd_val(*pmd)) {
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
				last_map_addr = phys_pte_init(pte, address,
								end, prot);
				unmap_low_page(pte);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping,
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				last_map_addr = next;
				continue;
			}
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte(address >> PAGE_SHIFT,
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = next;
			continue;
		}

		pte = alloc_low_page(&pte_phys);
		last_map_addr = phys_pte_init(pte, address, end, new_prot);
		unmap_low_page(pte);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return last_map_addr;
}

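/*
 * Set up the pud level for the physical range [addr, end): use 1G pages
 * where page_size_mask allows it, otherwise descend into phys_pmd_init().
 * Ranges that the e820 map does not cover are cleared during boot.
 */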
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0, next;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = next) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;
		pgprot_t prot = PAGE_KERNEL;

		if (addr >= end)
			break;

		next = (addr & PUD_MASK) + PUD_SIZE;

		if (!after_bootmem && !e820_any_mapped(addr, next, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud)) {
				pmd = map_low_page(pmd_offset(pud, 0));
				last_map_addr = phys_pmd_init(pmd, addr, end,
							 page_size_mask, prot);
				unmap_low_page(pmd);
				__flush_tlb_all();
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection  bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				last_map_addr = next;
				continue;
			}
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = next;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
					      prot);
		unmap_low_page(pmd);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}

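/*
 * Build the kernel direct mapping for the physical range [start, end),
 * walking the pgd entries and delegating to phys_pud_init(). Newly
 * installed pgd entries are propagated with sync_global_pgds().
 */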
unsigned long __meminit
kernel_physical_mapping_init(unsigned long start,
			     unsigned long end,
			     unsigned long page_size_mask)
{
	bool pgd_changed = false;
	unsigned long next, last_map_addr = end;
	unsigned long addr;

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);
	addr = start;

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		next = (start + PGDIR_SIZE) & PGDIR_MASK;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
			last_map_addr = phys_pud_init(pud, __pa(start),
						 __pa(end), page_size_mask);
			unmap_low_page(pud);
			continue;
		}

		pud = alloc_low_page(&pud_phys);
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);
		unmap_low_page(pud);

		spin_lock(&init_mm.page_table_lock);
		pgd_populate(&init_mm, pgd, __va(pud_phys));
		spin_unlock(&init_mm.page_table_lock);
		pgd_changed = true;
	}

	if (pgd_changed)
		sync_global_pgds(addr, end);

	__flush_tlb_all();

	return last_map_addr;
}

#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
}
#endif

void __init paging_init(void)
{
	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	/*
	 * clear the default setting with node 0
	 * note: don't use nodes_clear here, that is really clearing when
	 *	 numa support is not compiled in, and later node_set_state
	 *	 will not set it back.
	 */
	node_clear_state(0, N_NORMAL_MEMORY);

	zone_sizes_init();
}

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
 * updating.
 */
static void  update_end_of_memory_vars(u64 start, u64 size)
{
	unsigned long end_pfn = PFN_UP(start + size);

	if (end_pfn > max_pfn) {
		max_pfn = end_pfn;
		max_low_pfn = end_pfn;
		high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
	}
}

/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(nid, zone, start_pfn, nr_pages);
	WARN_ON_ONCE(ret);

	/* update max_pfn, max_low_pfn and high_memory */
	update_end_of_memory_vars(start, size);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#endif /* CONFIG_MEMORY_HOTPLUG */

static struct kcore_list kcore_vsyscall;

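/*
 * Late memory setup: hand the remaining bootmem pages to the buddy
 * allocator, register the vsyscall range for /proc/kcore and print the
 * memory summary line.
 */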
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;
	unsigned long absent_pages;

	pci_iommu_alloc();

	/* clear_bss() already clear the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif

	absent_pages = absent_pages_in_range(0, max_pfn);
	reservedpages = max_pfn - totalram_pages - absent_pages;
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
			 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
			 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		absent_pages << (PAGE_SHIFT-10),
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

int kernel_set_to_readonly;

void set_kernel_text_rw(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__stop___ex_table);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read write\n",
		 start, end);

	/*
	 * Make the kernel identity mapping for text RW. Kernel text
	 * mapping will always be RO. Refer to the comment in
	 * static_protections() in pageattr.c
	 */
	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}

void set_kernel_text_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__stop___ex_table);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read only\n",
		 start, end);

	/*
	 * Set the kernel identity mapping for text RO.
	 */
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
}

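/*
 * Write-protect the kernel text and rodata, mark rodata non-executable
 * and free the unused padding pages between the individual sections.
 */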
void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long rodata_start =
		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
	unsigned long data_start = (unsigned long) &_sdata;

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	kernel_set_to_readonly = 1;

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif

	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(text_end)),
			(unsigned long)
				 page_address(virt_to_page(rodata_start)));
	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(rodata_end)),
			(unsigned long) page_address(virt_to_page(data_start)));
}

#endif

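/*
 * Check whether a kernel virtual address is backed by a valid page by
 * walking the page tables; 2M mappings are handled at the pmd level.
 */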
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_IA32_EMULATION
	if (!mm || mm->context.ia32_compat)
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(mm);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable mm, typically from interrupt
 * context. It is less reliable than using a task's mm and may give
 * false positives.
 */
int in_gate_area_no_mm(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_X86_UV
unsigned long memory_block_size_bytes(void)
{
	if (is_uv_system()) {
		printk(KERN_INFO "UV: memory block size 2GB\n");
		return 2UL * 1024 * 1024 * 1024;
	}
	return MIN_MEMORY_BLOCK_SIZE;
}
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

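/*
 * Populate the virtual memmap for 'size' struct pages starting at
 * start_page: with PSE available the backing memory is allocated and
 * mapped in 2M chunks, otherwise page by page.
 */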
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block_buf(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	sync_global_pgds((unsigned long)start_page, end);
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif