/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>
#include <linux/gfp.h>

#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>
#include <asm/uv/uv.h>
#include <asm/setup.h>

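/*
 * The "gbpages"/"nogbpages" kernel parameters allow or forbid the use
 * of 1GB pages for the kernel's direct mapping (direct_gbpages).
 */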
static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init() allocates all the fixmap pagetables
 * contiguously in physical space, so we can cache the location of the
 * first one and move around without checking the pgd every time.
 */

pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);

int force_personality32;

/*
 * noexec32=on|off
 * Control the non-executable heap for 32-bit processes.
 * To control the stack too, use noexec=off.
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	else if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	return 1;
}
__setup("noexec32=", nonx32_setup);

/*
 * When memory is added or removed, make sure that all processes'
 * mm_structs have suitable PGD entries in the local PGD-level page.
 */
void sync_global_pgds(unsigned long start, unsigned long end)
{
	unsigned long address;

	for (address = start; address <= end; address += PGDIR_SIZE) {
		const pgd_t *pgd_ref = pgd_offset_k(address);
		struct page *page;

		if (pgd_none(*pgd_ref))
			continue;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			spinlock_t *pgt_lock;

			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			/* the pgt_lock is only needed for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);
			else
				BUG_ON(pgd_page_vaddr(*pgd)
				       != pgd_page_vaddr(*pgd_ref));

			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

/*
 * NOTE: This function is marked __ref because it calls the __init
 * function alloc_bootmem_pages(). That is safe ONLY while
 * after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

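/*
 * fill_pud/fill_pmd/fill_pte walk one level of the init_mm page tables,
 * allocating the next-level table with spp_getpage() if the entry is
 * empty, and return the entry for @vaddr at the next level down.
 */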
static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
{
	if (pgd_none(*pgd)) {
		pud_t *pud = (pud_t *)spp_getpage();
		pgd_populate(&init_mm, pgd, pud);
		if (pud != pud_offset(pgd, 0))
			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
			       pud, pud_offset(pgd, 0));
	}
	return pud_offset(pgd, vaddr);
}

static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
	if (pud_none(*pud)) {
		pmd_t *pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0))
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
			       pmd, pmd_offset(pud, 0));
	}
	return pmd_offset(pud, vaddr);
}

static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0))
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
	}
	return pte_offset_kernel(pmd, vaddr);
}

void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	pmd = fill_pmd(pud, vaddr);
	pte = fill_pte(pmd, vaddr);

	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}

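/*
 * populate_extra_pmd()/populate_extra_pte() make sure the kernel page
 * tables reach down to the PMD/PTE level for @vaddr, allocating
 * intermediate tables as needed.
 */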
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = pgd_offset_k(vaddr);
	pud = fill_pud(pgd, vaddr);
	return fill_pmd(pud, vaddr);
}

pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd;

	pmd = populate_extra_pmd(vaddr);
	return fill_pte(pmd, vaddr);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			pud = (pud_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(pgd, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end - _text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile-time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _brk_end.  _brk_end
 * is rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;

	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

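/*
 * alloc_low_page() returns one zeroed page for early page-table
 * construction: before bootmem is up it takes the next page from the
 * pgt_buf_end..pgt_buf_top scratch window (mapped via early_memremap());
 * afterwards it simply gets a zeroed page from the page allocator.
 */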
static __ref void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = pgt_buf_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= pgt_buf_top)
		panic("alloc_low_page: ran out of memory");

	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
	clear_page(adr);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

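/*
 * map_low_page()/unmap_low_page() provide a temporary mapping of a
 * page-table page: a no-op once the direct mapping is up, otherwise an
 * early_memremap() that the caller must undo with unmap_low_page().
 */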
static __ref void *map_low_page(void *virt)
{
	void *adr;
	unsigned long phys, left;

	if (after_bootmem)
		return virt;

	phys = __pa(virt);
	left = phys & (PAGE_SIZE - 1);
	adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
	adr = (void *)(((unsigned long)adr) | left);

	return adr;
}

static __ref void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
}

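/*
 * phys_pte_init() fills one PTE page with 4K kernel mappings covering
 * [addr, end), re-using entries that are already set up, and returns
 * the highest address mapped.
 */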
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
	      pgprot_t prot)
{
	unsigned pages = 0;
	unsigned long last_map_addr = end;
	int i;

	pte_t *pte = pte_page + pte_index(addr);

	for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			if (!after_bootmem) {
				for(; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume that whoever pre-set up
		 * these mappings knows what they are doing.
		 */
		if (pte_val(*pte)) {
			pages++;
			continue;
		}

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return last_map_addr;
}

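/*
 * phys_pmd_init() maps [address, end) at the PMD level, using 2MB pages
 * where page_size_mask allows it and falling back to a PTE page
 * otherwise; returns the highest address mapped.
 */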
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
	      unsigned long page_size_mask, pgprot_t prot)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long pte_phys;
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;
		pgprot_t new_prot = prot;

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd)) {
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
				last_map_addr = phys_pte_init(pte, address,
								end, prot);
				unmap_low_page(pte);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are OK with a PG_LEVEL_2M mapping, then we
			 * will use the existing mapping.
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				pages++;
				continue;
			}
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte(address >> PAGE_SHIFT,
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
			continue;
		}

		pte = alloc_low_page(&pte_phys);
		last_map_addr = phys_pte_init(pte, address, end, new_prot);
		unmap_low_page(pte);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return last_map_addr;
}

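/*
 * phys_pud_init() maps [addr, end) at the PUD level, using 1GB pages
 * where page_size_mask allows it and descending into phys_pmd_init()
 * otherwise; returns the highest address mapped.
 */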
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;
		pgprot_t prot = PAGE_KERNEL;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud)) {
				pmd = map_low_page(pmd_offset(pud, 0));
				last_map_addr = phys_pmd_init(pmd, addr, end,
							 page_size_mask, prot);
				unmap_low_page(pmd);
				__flush_tlb_all();
				continue;
			}
			/*
			 * If we are OK with a PG_LEVEL_1G mapping, then we
			 * will use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				pages++;
				continue;
			}
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
					      prot);
		unmap_low_page(pmd);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}

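/*
 * kernel_physical_mapping_init() builds the direct mapping for the
 * physical range [start, end), delegating to phys_pud_init() for each
 * PGD entry; newly created PGD entries are propagated to all page
 * tables with sync_global_pgds().
 */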
unsigned long __meminit
kernel_physical_mapping_init(unsigned long start,
			     unsigned long end,
			     unsigned long page_size_mask)
{
	bool pgd_changed = false;
	unsigned long next, last_map_addr = end;
	unsigned long addr;

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);
	addr = start;

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		next = (start + PGDIR_SIZE) & PGDIR_MASK;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
			last_map_addr = phys_pud_init(pud, __pa(start),
						 __pa(end), page_size_mask);
			unmap_low_page(pud);
			continue;
		}

		pud = alloc_low_page(&pud_phys);
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);
		unmap_low_page(pud);

		spin_lock(&init_mm.page_table_lock);
		pgd_populate(&init_mm, pgd, __va(pud_phys));
		spin_unlock(&init_mm.page_table_lock);
		pgd_changed = true;
	}

	if (pgd_changed)
		sync_global_pgds(addr, end);

	__flush_tlb_all();

	return last_map_addr;
}

#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
	memblock_x86_register_active_regions(0, 0, max_pfn);
}
#endif

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
#ifdef CONFIG_ZONE_DMA
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
#endif
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	/*
	 * Clear the default setting for node 0.
	 * Note: don't use nodes_clear here; when NUMA support is not
	 *	 compiled in it really clears the state, and a later
	 *	 node_set_state will not set it back.
	 */
	node_clear_state(0, N_NORMAL_MEMORY);

	free_area_init_nodes(max_zone_pfns);
}

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
 * updating.
 */
static void  update_end_of_memory_vars(u64 start, u64 size)
{
	unsigned long end_pfn = PFN_UP(start + size);

	if (end_pfn > max_pfn) {
		max_pfn = end_pfn;
		max_low_pfn = end_pfn;
		high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
	}
}

/*
 * Memory is always added to the NORMAL zone. This means you will never
 * get additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(nid, zone, start_pfn, nr_pages);
	WARN_ON_ONCE(ret);

	/* update max_pfn, max_low_pfn and high_memory */
	update_end_of_memory_vars(start, size);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#endif /* CONFIG_MEMORY_HOTPLUG */

static struct kcore_list kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;
	unsigned long absent_pages;

	pci_iommu_alloc();

	/* clear_bss() already cleared the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif

	absent_pages = absent_pages_in_range(0, max_pfn);
	reservedpages = max_pfn - totalram_pages - absent_pages;
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
			 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
			 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		absent_pages << (PAGE_SHIFT-10),
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

int kernel_set_to_readonly;

void set_kernel_text_rw(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__stop___ex_table);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read write\n",
		 start, end);

	/*
	 * Make the kernel identity mapping for text RW. Kernel text
	 * mapping will always be RO. Refer to the comment in
	 * static_protections() in pageattr.c
	 */
	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}

void set_kernel_text_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__stop___ex_table);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read only\n",
		 start, end);

	/*
	 * Set the kernel identity mapping for text RO.
	 */
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
}

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long rodata_start =
		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
	unsigned long data_start = (unsigned long) &_sdata;

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	kernel_set_to_readonly = 1;

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif

	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(text_end)),
			(unsigned long)
				 page_address(virt_to_page(rodata_start)));
	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(rodata_end)),
			(unsigned long) page_address(virt_to_page(data_start)));
}

#endif

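/*
 * kern_addr_valid() reports whether a kernel virtual address is backed
 * by a valid pfn, rejecting non-canonical addresses and walking the
 * page tables down to the PTE (or large-page) level.
 */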
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page. This only
 * covers the 64-bit vsyscall page now. 32-bit has a real VMA now and
 * does not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_IA32_EMULATION
	if (!mm || mm->context.ia32_compat)
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(mm);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable mm, typically from interrupt
 * context. It is less reliable than using a task's mm and may give
 * false positives.
 */
int in_gate_area_no_mm(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_X86_UV
#define MIN_MEMORY_BLOCK_SIZE   (1 << SECTION_SIZE_BITS)

unsigned long memory_block_size_bytes(void)
{
	if (is_uv_system()) {
		printk(KERN_INFO "UV: memory block size 2GB\n");
		return 2UL * 1024 * 1024 * 1024;
	}
	return MIN_MEMORY_BLOCK_SIZE;
}
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

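/*
 * vmemmap_populate() creates the virtual memmap for @start_page: with
 * PSE it maps PMD-sized blocks from vmemmap_alloc_block_buf(),
 * otherwise it populates individual PTEs; new PGD entries are synced
 * with sync_global_pgds() at the end.
 */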
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block_buf(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	sync_global_pgds((unsigned long)start_page, end);
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif