init_64.c 38.7 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4
/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
T
Thomas Gleixner 已提交
21
#include <linux/initrd.h>
L
Linus Torvalds 已提交
22 23
#include <linux/pagemap.h>
#include <linux/bootmem.h>
24
#include <linux/memblock.h>
L
Linus Torvalds 已提交
25
#include <linux/proc_fs.h>
26
#include <linux/pci.h>
27
#include <linux/pfn.h>
28
#include <linux/poison.h>
29
#include <linux/dma-mapping.h>
30
#include <linux/memory.h>
31
#include <linux/memory_hotplug.h>
32
#include <linux/memremap.h>
33
#include <linux/nmi.h>
34
#include <linux/gfp.h>
35
#include <linux/kcore.h>
L
Linus Torvalds 已提交
36 37

#include <asm/processor.h>
38
#include <asm/bios_ebda.h>
39
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
40 41 42 43
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
44
#include <asm/e820/api.h>
L
Linus Torvalds 已提交
45 46 47 48 49
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
50
#include <asm/sections.h>
51
#include <asm/kdebug.h>
52
#include <asm/numa.h>
L
Laura Abbott 已提交
53
#include <asm/set_memory.h>
54
#include <asm/init.h>
55
#include <asm/uv/uv.h>
56
#include <asm/setup.h>
L
Linus Torvalds 已提交
57

58 59
#include "mm_internal.h"

60
#include "ident_map.c"
61

L
Linus Torvalds 已提交
62 63 64 65 66 67
/*
 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
 * physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

68
/* Bits supported by the hardware: */
69
pteval_t __supported_pte_mask __read_mostly = ~0;
70 71
/* Bits allowed in normal kernel mappings: */
pteval_t __default_kernel_pte_mask __read_mostly = ~0;
72
EXPORT_SYMBOL_GPL(__supported_pte_mask);
73 74
/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */
EXPORT_SYMBOL(__default_kernel_pte_mask);
75 76 77

int force_personality32;

I
Ingo Molnar 已提交
78 79 80 81 82 83 84 85
/*
 * noexec32=on|off
 * Control non executable heap for 32bit processes.
 * To control the stack too use noexec=off
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
86 87 88 89 90 91 92 93 94 95
static int __init nonx32_setup(char *str)
{
	/*
	 * Only the exact strings "on" and "off" are acted upon; any other
	 * value leaves force_personality32 as it was.
	 */
	if (strcmp(str, "off") == 0)
		force_personality32 |= READ_IMPLIES_EXEC;
	else if (strcmp(str, "on") == 0)
		force_personality32 &= ~READ_IMPLIES_EXEC;

	return 1;
}
__setup("noexec32=", nonx32_setup);

96
static void sync_global_pgds_l5(unsigned long start, unsigned long end)
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
{
	unsigned long addr;

	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
		const pgd_t *pgd_ref = pgd_offset_k(addr);
		struct page *page;

		/* Check for overflow */
		if (addr < start)
			break;

		if (pgd_none(*pgd_ref))
			continue;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			spinlock_t *pgt_lock;

			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

			if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);

			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}
132 133

static void sync_global_pgds_l4(unsigned long start, unsigned long end)
134
{
135
	unsigned long addr;
136

137 138
	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
		pgd_t *pgd_ref = pgd_offset_k(addr);
139
		const p4d_t *p4d_ref;
140 141
		struct page *page;

142 143 144 145
		/*
		 * With folded p4d, pgd_none() is always false, we need to
		 * handle synchonization on p4d level.
		 */
146
		MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
147
		p4d_ref = p4d_offset(pgd_ref, addr);
148 149

		if (p4d_none(*p4d_ref))
150 151
			continue;

A
Andrea Arcangeli 已提交
152
		spin_lock(&pgd_lock);
153
		list_for_each_entry(page, &pgd_list, lru) {
154
			pgd_t *pgd;
155
			p4d_t *p4d;
156 157
			spinlock_t *pgt_lock;

158 159
			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
			p4d = p4d_offset(pgd, addr);
A
Andrea Arcangeli 已提交
160
			/* the pgt_lock only for Xen */
161 162 163
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

164 165 166
			if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
				BUG_ON(p4d_page_vaddr(*p4d)
				       != p4d_page_vaddr(*p4d_ref));
167

168 169
			if (p4d_none(*p4d))
				set_p4d(p4d, *p4d_ref);
170

171
			spin_unlock(pgt_lock);
172
		}
A
Andrea Arcangeli 已提交
173
		spin_unlock(&pgd_lock);
174
	}
175
}
176 177 178 179 180 181 182

/*
 * When memory was added make sure all the processes MM have
 * suitable PGD entries in the local PGD level page.
 *
 * Dispatches on pgtable_l5_enabled() to the 5-level or 4-level
 * implementation; [start, end] is passed through unchanged.
 */
void sync_global_pgds(unsigned long start, unsigned long end)
{
	if (pgtable_l5_enabled())
		sync_global_pgds_l5(start, end);
	else
		sync_global_pgds_l4(start, end);
}
188

189 190 191 192 193
/*
 * NOTE: This function is marked __ref because it calls __init function
 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
T
Thomas Gleixner 已提交
194
{
L
Linus Torvalds 已提交
195
	void *ptr;
T
Thomas Gleixner 已提交
196

L
Linus Torvalds 已提交
197
	if (after_bootmem)
198
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
L
Linus Torvalds 已提交
199 200
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);
T
Thomas Gleixner 已提交
201 202 203 204 205

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}
L
Linus Torvalds 已提交
206

207
	pr_debug("spp_getpage %p\n", ptr);
T
Thomas Gleixner 已提交
208

L
Linus Torvalds 已提交
209
	return ptr;
T
Thomas Gleixner 已提交
210
}
L
Linus Torvalds 已提交
211

212
/*
 * Return the p4d entry for @vaddr below @pgd, first installing a freshly
 * allocated p4d page in init_mm when the pgd entry is empty.
 */
static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr)
{
	if (pgd_none(*pgd)) {
		p4d_t *p4d = (p4d_t *)spp_getpage();
		pgd_populate(&init_mm, pgd, p4d);
		/* Sanity check: the entry must now resolve to the new page. */
		if (p4d != p4d_offset(pgd, 0))
			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
			       p4d, p4d_offset(pgd, 0));
	}
	return p4d_offset(pgd, vaddr);
}

/*
 * Return the pud entry for @vaddr below @p4d, first installing a freshly
 * allocated pud page in init_mm when the p4d entry is empty.
 */
static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr)
{
	if (p4d_none(*p4d)) {
		pud_t *pud = (pud_t *)spp_getpage();
		p4d_populate(&init_mm, p4d, pud);
		/* Sanity check: the entry must now resolve to the new page. */
		if (pud != pud_offset(p4d, 0))
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
			       pud, pud_offset(p4d, 0));
	}
	return pud_offset(p4d, vaddr);
}
L
Linus Torvalds 已提交
235

236
/*
 * Return the pmd entry for @vaddr below @pud, first installing a freshly
 * allocated pmd page in init_mm when the pud entry is empty.
 */
static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
	if (pud_none(*pud)) {
		pmd_t *pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		/* Sanity check: the entry must now resolve to the new page. */
		if (pmd != pmd_offset(pud, 0))
			printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n",
			       pmd, pmd_offset(pud, 0));
	}
	return pmd_offset(pud, vaddr);
}

248
/*
 * Return the pte entry for @vaddr below @pmd, first installing a freshly
 * allocated pte page in init_mm when the pmd entry is empty.
 */
static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		/* Sanity check: the entry must now resolve to the new page. */
		if (pte != pte_offset_kernel(pmd, 0))
			printk(KERN_ERR "PAGETABLE BUG #03!\n");
	}
	return pte_offset_kernel(pmd, vaddr);
}

259
/*
 * Install @new_pte for @vaddr underneath the given pud entry, allocating
 * the pmd and pte levels on demand, then flush the TLB entry for @vaddr.
 */
static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
{
	pmd_t *pmd = fill_pmd(pud, vaddr);
	pte_t *pte = fill_pte(pmd, vaddr);

	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one_kernel(vaddr);
}

273 274 275 276 277 278 279 280 281 282 283 284 285 286 287
/*
 * Install @new_pte for @vaddr starting from a p4d page: pick the p4d slot
 * for @vaddr, make sure a pud level exists below it, then hand over to
 * __set_pte_vaddr() for the remaining levels.
 */
void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
{
	p4d_t *entry = &p4d_page[p4d_index(vaddr)];

	__set_pte_vaddr(fill_pud(entry, vaddr), vaddr, new_pte);
}

/*
 * Install @new_pte for @vaddr starting from a pud page: pick the pud slot
 * for @vaddr and hand over to __set_pte_vaddr() for the lower levels.
 */
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	__set_pte_vaddr(&pud_page[pud_index(vaddr)], vaddr, new_pte);
}

288
/*
 * Install @pteval for kernel virtual address @vaddr in the kernel page
 * table.  The PGD entry must already be populated; if it is not, an error
 * is logged and nothing is mapped.
 */
void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	p4d_t *p4d_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}

	p4d_page = p4d_offset(pgd, 0);
	set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
}

306
/*
 * Walk the kernel page table for @vaddr, allocating any missing p4d, pud
 * and pmd pages, and return the pmd entry that covers @vaddr.
 */
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset_k(vaddr);
	p4d = fill_p4d(pgd, vaddr);
	pud = fill_pud(p4d, vaddr);
	return fill_pmd(pud, vaddr);
}

/*
 * Like populate_extra_pmd(), but also allocates the pte page if needed
 * and returns the pte entry that covers @vaddr.
 */
pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd;

	pmd = populate_extra_pmd(vaddr);
	return fill_pte(pmd, vaddr);
}

326 327 328 329
/*
 * Create large page table mappings for a range of physical addresses.
 *
 * @phys and @size must both be PMD_SIZE aligned (BUG otherwise).  Each
 * PMD_SIZE chunk is mapped at __va(phys) with PAGE_KERNEL_LARGE combined
 * with the protection bits derived from @cache.  Missing intermediate
 * tables are allocated with spp_getpage(); an already populated pmd entry
 * is a BUG.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
					enum page_cache_mode cache)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pgprot_t prot;
	unsigned long vaddr;

	pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
		pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache)));
	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		/* Every level is indexed with the same virtual address. */
		vaddr = (unsigned long)__va(phys);

		pgd = pgd_offset_k(vaddr);
		if (pgd_none(*pgd)) {
			p4d = (p4d_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		p4d = p4d_offset(pgd, vaddr);
		if (p4d_none(*p4d)) {
			pud = (pud_t *) spp_getpage();
			set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(p4d, vaddr);
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		/*
		 * The original indexed this level with the physical address;
		 * use the virtual address like the levels above so the walk
		 * is consistent by construction.
		 */
		pmd = pmd_offset(pud, vaddr);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

/* Map [phys, phys + size) with large pages using _PAGE_CACHE_MODE_WB. */
void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB);
}

/* Map [phys, phys + size) with large pages using _PAGE_CACHE_MODE_UC. */
void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC);
}

376
/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_base holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _brk_end.  _brk_end
 * is rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;

	/*
	 * Native path, max_pfn_mapped is not set yet.
	 * Xen has valid max_pfn_mapped set in
	 *	arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
	 */
	if (max_pfn_mapped)
		vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);

	/* Clear every populated pmd that lies outside [_text, end]. */
	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

412 413 414 415
/*
 * Create PTE level page table mapping for physical addresses.
 * It returns the last physical address mapped.
 */
416
static unsigned long __meminit
417
phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
418
	      pgprot_t prot)
419
{
420 421 422
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;
	pte_t *pte;
423
	int i;
424

425 426
	pte = pte_page + pte_index(paddr);
	i = pte_index(paddr);
427

428 429 430
	for (; i < PTRS_PER_PTE; i++, paddr = paddr_next, pte++) {
		paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE;
		if (paddr >= paddr_end) {
431
			if (!after_bootmem &&
432
			    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
433
					     E820_TYPE_RAM) &&
434
			    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
435
					     E820_TYPE_RESERVED_KERN))
436 437
				set_pte(pte, __pte(0));
			continue;
438 439
		}

440 441 442 443 444 445
		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume someone who pre-setup
		 * these mappings are more intelligent.
		 */
446
		if (!pte_none(*pte)) {
J
Jan Beulich 已提交
447 448
			if (!after_bootmem)
				pages++;
449
			continue;
450
		}
451 452

		if (0)
453 454
			pr_info("   pte=%p addr=%lx pte=%016lx\n", pte, paddr,
				pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
455
		pages++;
456 457
		set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
		paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
458
	}
459

460
	update_page_count(PG_LEVEL_4K, pages);
461

462
	return paddr_last;
463 464
}

465 466 467 468 469
/*
 * Create PMD level page table mapping for physical addresses. The virtual
 * and physical address have to be aligned at this level.
 * It returns the last physical address mapped.
 */
470
static unsigned long __meminit
471
phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
472
	      unsigned long page_size_mask, pgprot_t prot)
473
{
474 475
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;
476

477
	int i = pmd_index(paddr);
478

479 480
	for (; i < PTRS_PER_PMD; i++, paddr = paddr_next) {
		pmd_t *pmd = pmd_page + pmd_index(paddr);
481
		pte_t *pte;
482
		pgprot_t new_prot = prot;
483

484 485
		paddr_next = (paddr & PMD_MASK) + PMD_SIZE;
		if (paddr >= paddr_end) {
486
			if (!after_bootmem &&
487
			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
488
					     E820_TYPE_RAM) &&
489
			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
490
					     E820_TYPE_RESERVED_KERN))
491 492
				set_pmd(pmd, __pmd(0));
			continue;
493
		}
494

495
		if (!pmd_none(*pmd)) {
496 497
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
498
				pte = (pte_t *)pmd_page_vaddr(*pmd);
499 500
				paddr_last = phys_pte_init(pte, paddr,
							   paddr_end, prot);
501
				spin_unlock(&init_mm.page_table_lock);
502
				continue;
503
			}
504 505 506 507 508 509 510 511 512 513 514 515
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping,
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
516
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
J
Jan Beulich 已提交
517 518
				if (!after_bootmem)
					pages++;
519
				paddr_last = paddr_next;
520
				continue;
521
			}
522
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
523 524
		}

525
		if (page_size_mask & (1<<PG_LEVEL_2M)) {
526
			pages++;
527
			spin_lock(&init_mm.page_table_lock);
528
			set_pte((pte_t *)pmd,
529
				pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
530
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
531
			spin_unlock(&init_mm.page_table_lock);
532
			paddr_last = paddr_next;
533
			continue;
534
		}
535

536
		pte = alloc_low_page();
537
		paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot);
538

539
		spin_lock(&init_mm.page_table_lock);
540
		pmd_populate_kernel(&init_mm, pmd, pte);
541
		spin_unlock(&init_mm.page_table_lock);
542
	}
543
	update_page_count(PG_LEVEL_2M, pages);
544
	return paddr_last;
545 546
}

547 548
/*
 * Create PUD level page table mapping for physical addresses. The virtual
549 550
 * and physical address do not have to be aligned at this level. KASLR can
 * randomize virtual addresses up to this level.
551 552
 * It returns the last physical address mapped.
 */
553
static unsigned long __meminit
554 555
phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
	      unsigned long page_size_mask)
T
Thomas Gleixner 已提交
556
{
557 558
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;
559 560
	unsigned long vaddr = (unsigned long)__va(paddr);
	int i = pud_index(vaddr);
561

562
	for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) {
563
		pud_t *pud;
L
Linus Torvalds 已提交
564
		pmd_t *pmd;
565
		pgprot_t prot = PAGE_KERNEL;
L
Linus Torvalds 已提交
566

567 568
		vaddr = (unsigned long)__va(paddr);
		pud = pud_page + pud_index(vaddr);
569
		paddr_next = (paddr & PUD_MASK) + PUD_SIZE;
570

571
		if (paddr >= paddr_end) {
572
			if (!after_bootmem &&
573
			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
574
					     E820_TYPE_RAM) &&
575
			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
576
					     E820_TYPE_RESERVED_KERN))
577
				set_pud(pud, __pud(0));
L
Linus Torvalds 已提交
578
			continue;
T
Thomas Gleixner 已提交
579
		}
L
Linus Torvalds 已提交
580

581
		if (!pud_none(*pud)) {
582
			if (!pud_large(*pud)) {
583
				pmd = pmd_offset(pud, 0);
584 585 586 587
				paddr_last = phys_pmd_init(pmd, paddr,
							   paddr_end,
							   page_size_mask,
							   prot);
Y
Yinghai Lu 已提交
588
				__flush_tlb_all();
589 590
				continue;
			}
591 592 593 594 595 596 597 598 599 600 601 602
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection  bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
603
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
J
Jan Beulich 已提交
604 605
				if (!after_bootmem)
					pages++;
606
				paddr_last = paddr_next;
607
				continue;
608
			}
609
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
610 611
		}

612
		if (page_size_mask & (1<<PG_LEVEL_1G)) {
613
			pages++;
614
			spin_lock(&init_mm.page_table_lock);
615
			set_pte((pte_t *)pud,
616
				pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
617
					PAGE_KERNEL_LARGE));
618
			spin_unlock(&init_mm.page_table_lock);
619
			paddr_last = paddr_next;
620 621 622
			continue;
		}

623
		pmd = alloc_low_page();
624 625
		paddr_last = phys_pmd_init(pmd, paddr, paddr_end,
					   page_size_mask, prot);
626 627

		spin_lock(&init_mm.page_table_lock);
628
		pud_populate(&init_mm, pud, pmd);
629
		spin_unlock(&init_mm.page_table_lock);
L
Linus Torvalds 已提交
630
	}
A
Andi Kleen 已提交
631
	__flush_tlb_all();
632

633
	update_page_count(PG_LEVEL_1G, pages);
634

635
	return paddr_last;
T
Thomas Gleixner 已提交
636
}
L
Linus Torvalds 已提交
637

638 639 640 641 642 643 644 645
/*
 * Create P4D level page table mapping for physical addresses.
 * It returns the last physical address mapped.
 *
 * Without 5-level paging the p4d page is handed straight to
 * phys_pud_init() as a pud page.
 */
static unsigned long __meminit
phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
	      unsigned long page_size_mask)
{
	unsigned long paddr_next, paddr_last = paddr_end;
	unsigned long vaddr = (unsigned long)__va(paddr);
	int i = p4d_index(vaddr);

	if (!pgtable_l5_enabled())
		return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);

	for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
		p4d_t *p4d;
		pud_t *pud;

		vaddr = (unsigned long)__va(paddr);
		p4d = p4d_page + p4d_index(vaddr);
		paddr_next = (paddr & P4D_MASK) + P4D_SIZE;

		if (paddr >= paddr_end) {
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
				set_p4d(p4d, __p4d(0));
			continue;
		}

		if (!p4d_none(*p4d)) {
			/* Existing pud page: fill in what is missing. */
			pud = pud_offset(p4d, 0);
			paddr_last = phys_pud_init(pud, paddr,
					paddr_end,
					page_size_mask);
			__flush_tlb_all();
			continue;
		}

		/* Fill the new pud page before it becomes visible via the p4d. */
		pud = alloc_low_page();
		paddr_last = phys_pud_init(pud, paddr, paddr_end,
					   page_size_mask);

		spin_lock(&init_mm.page_table_lock);
		p4d_populate(&init_mm, p4d, pud);
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	return paddr_last;
}

689 690
/*
 * Create page table mapping for the physical memory for specific physical
691
 * addresses. The virtual and physical addresses have to be aligned on PMD level
692 693
 * down. It returns the last physical address mapped.
 */
694
unsigned long __meminit
695 696
kernel_physical_mapping_init(unsigned long paddr_start,
			     unsigned long paddr_end,
697
			     unsigned long page_size_mask)
T
Thomas Gleixner 已提交
698
{
699
	bool pgd_changed = false;
700
	unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
L
Linus Torvalds 已提交
701

702 703 704 705
	paddr_last = paddr_end;
	vaddr = (unsigned long)__va(paddr_start);
	vaddr_end = (unsigned long)__va(paddr_end);
	vaddr_start = vaddr;
L
Linus Torvalds 已提交
706

707 708
	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
		pgd_t *pgd = pgd_offset_k(vaddr);
709
		p4d_t *p4d;
710

711
		vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
712

713 714 715
		if (pgd_val(*pgd)) {
			p4d = (p4d_t *)pgd_page_vaddr(*pgd);
			paddr_last = phys_p4d_init(p4d, __pa(vaddr),
716 717
						   __pa(vaddr_end),
						   page_size_mask);
718 719 720
			continue;
		}

721 722
		p4d = alloc_low_page();
		paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
723
					   page_size_mask);
724 725

		spin_lock(&init_mm.page_table_lock);
726
		if (pgtable_l5_enabled())
727 728 729
			pgd_populate(&init_mm, pgd, p4d);
		else
			p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
730
		spin_unlock(&init_mm.page_table_lock);
731
		pgd_changed = true;
T
Thomas Gleixner 已提交
732
	}
733 734

	if (pgd_changed)
735
		sync_global_pgds(vaddr_start, vaddr_end - 1);
736

737
	__flush_tlb_all();
L
Linus Torvalds 已提交
738

739
	return paddr_last;
740
}
741

742
#ifndef CONFIG_NUMA
743
/* Non-NUMA build: assign the whole memblock.memory range to node 0. */
void __init initmem_init(void)
{
	memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
}
747
#endif
748

L
Linus Torvalds 已提交
749 750
/*
 * Initialise sparse memory, reset the default node-0 memory state bits
 * and then set up the zone sizes.
 */
void __init paging_init(void)
{
	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	/*
	 * clear the default setting with node 0
	 * note: don't use nodes_clear here, that is really clearing when
	 *	 numa support is not compiled in, and later node_set_state
	 *	 will not set it back.
	 */
	node_clear_state(0, N_MEMORY);
	if (N_MEMORY != N_NORMAL_MEMORY)
		node_clear_state(0, N_NORMAL_MEMORY);

	zone_sizes_init();
}

767 768 769
/*
 * Memory hotplug specific functions
 */
770
#ifdef CONFIG_MEMORY_HOTPLUG
771 772 773 774
/*
 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
 * updating.
 */
775
static void update_end_of_memory_vars(u64 start, u64 size)
776 777 778 779 780 781 782 783 784 785
{
	unsigned long end_pfn = PFN_UP(start + size);

	if (end_pfn > max_pfn) {
		max_pfn = end_pfn;
		max_low_pfn = end_pfn;
		high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
	}
}

786 787
/*
 * Add @nr_pages pages starting at @start_pfn via __add_pages() and then
 * refresh the end-of-memory variables.  Returns the result of
 * __add_pages(); a non-zero result triggers a one-time warning.
 *
 * NOTE(review): the end-of-memory variables are updated even when
 * __add_pages() reports an error - confirm that this is intended.
 */
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap, bool want_memblock)
{
	int ret;

	ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
	WARN_ON_ONCE(ret);

	/* update max_pfn, max_low_pfn and high_memory */
	update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
				  nr_pages << PAGE_SHIFT);

	return ret;
}
800

801 802
/*
 * Map the physical range [start, start + size) with
 * init_memory_mapping() and then add its pages through add_pages().
 * Returns the result of add_pages().
 */
int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
		bool want_memblock)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	init_memory_mapping(start, start + size);

	return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
}
811

812 813
#define PAGE_INUSE 0xFD

814
/*
 * Release 2^@order pages starting at @page.  Reserved (bootmem) pages are
 * returned one by one, through put_page_bootmem() when page->freelist
 * carries a section-info magic and through free_reserved_page()
 * otherwise; all other pages go back via free_pages().
 */
static void __meminit free_pagetable(struct page *page, int order)
{
	unsigned long magic;
	unsigned int nr_pages = 1 << order;

	/* bootmem page has reserved flag */
	if (PageReserved(page)) {
		__ClearPageReserved(page);

		magic = (unsigned long)page->freelist;
		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
			while (nr_pages--)
				put_page_bootmem(page++);
		} else
			while (nr_pages--)
				free_reserved_page(page++);
	} else
		free_pages((unsigned long)page_address(page), order);
}

834
/*
 * Release the backing of one PMD-sized mapping: hand it back to @altmap
 * when one is supplied, otherwise free the pages with free_pagetable().
 */
static void __meminit free_hugepage_table(struct page *page,
		struct vmem_altmap *altmap)
{
	if (altmap)
		vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE);
	else
		free_pagetable(page, get_order(PMD_SIZE));
}

/*
 * Free the pte page at @pte_start and clear @pmd, but only when every
 * entry in the pte page is empty.
 */
static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	/* free a pte table */
	free_pagetable(pmd_page(*pmd), 0);
	spin_lock(&init_mm.page_table_lock);
	pmd_clear(pmd);
	spin_unlock(&init_mm.page_table_lock);
}

861
/*
 * Free the pmd page at @pmd_start and clear @pud, but only when every
 * entry in the pmd page is empty.
 */
static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	/* free a pmd table */
	free_pagetable(pud_page(*pud), 0);
	spin_lock(&init_mm.page_table_lock);
	pud_clear(pud);
	spin_unlock(&init_mm.page_table_lock);
}

879
/*
 * Free the pud page at @pud_start and clear @p4d, but only when every
 * entry in the pud page is empty.
 */
static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
{
	pud_t *pud;
	int i;

	for (i = 0; i < PTRS_PER_PUD; i++) {
		pud = pud_start + i;
		if (!pud_none(*pud))
			return;
	}

	/* free a pud table */
	free_pagetable(p4d_page(*p4d), 0);
	spin_lock(&init_mm.page_table_lock);
	p4d_clear(p4d);
	spin_unlock(&init_mm.page_table_lock);
}

897 898
/*
 * Tear down the pte entries covering [addr, end) in the pte page at
 * @pte_start.  With @direct set the backing pages are not freed and the
 * PG_LEVEL_4K page count is reduced instead.  Partially covered pages
 * are only poisoned with PAGE_INUSE and freed once the whole page holds
 * that pattern.
 */
static void __meminit
remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
		 bool direct)
{
	unsigned long next, pages = 0;
	pte_t *pte;
	void *page_addr;
	phys_addr_t phys_addr;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		/*
		 * We mapped [0,1G) memory as identity mapping when
		 * initializing, in arch/x86/kernel/head_64.S. These
		 * pagetables cannot be removed.
		 *
		 * NOTE(review): this adds the raw pte value (flag bits
		 * included) to a virtual address, and the early return skips
		 * the TLB flush and page accounting below - confirm that the
		 * check does what the comment above intends.
		 */
		phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
		if (phys_addr < (phys_addr_t)0x40000000)
			return;

		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
			/*
			 * Do not free direct mapping pages since they were
			 * freed when offlining, or simply not in use.
			 */
			if (!direct)
				free_pagetable(pte_page(*pte), 0);

			spin_lock(&init_mm.page_table_lock);
			pte_clear(&init_mm, addr, pte);
			spin_unlock(&init_mm.page_table_lock);

			/* For non-direct mapping, pages means nothing. */
			pages++;
		} else {
			/*
			 * If we are here, we are freeing vmemmap pages since
			 * direct mapped memory ranges to be freed are aligned.
			 *
			 * If we are not removing the whole page, it means
			 * other page structs in this page are being used and
			 * we cannot remove them. So fill the unused page_structs
			 * with 0xFD, and remove the page when it is wholly
			 * filled with 0xFD.
			 */
			memset((void *)addr, PAGE_INUSE, next - addr);

			page_addr = page_address(pte_page(*pte));
			if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
				free_pagetable(pte_page(*pte), 0);

				spin_lock(&init_mm.page_table_lock);
				pte_clear(&init_mm, addr, pte);
				spin_unlock(&init_mm.page_table_lock);
			}
		}
	}

	/* Call free_pte_table() in remove_pmd_table(). */
	flush_tlb_all();
	if (direct)
		update_page_count(PG_LEVEL_4K, -pages);
}

/*
 * Tear down the pmd entries covering [addr, end) in the pmd page at
 * @pmd_start.  Large (2M) entries are handled here; everything else is
 * delegated to remove_pte_table(), after which the pte page is freed if
 * it became empty.  With @direct set the backing pages are not freed and
 * the PG_LEVEL_2M page count is reduced instead.
 */
static void __meminit
remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
		 bool direct, struct vmem_altmap *altmap)
{
	unsigned long next, pages = 0;
	pte_t *pte_base;
	pmd_t *pmd;
	void *page_addr;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_large(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE)) {
				if (!direct)
					free_hugepage_table(pmd_page(*pmd),
							    altmap);

				spin_lock(&init_mm.page_table_lock);
				pmd_clear(pmd);
				spin_unlock(&init_mm.page_table_lock);
				pages++;
			} else {
				/* If here, we are freeing vmemmap pages. */
				memset((void *)addr, PAGE_INUSE, next - addr);

				/* Free only once the whole 2M area is poisoned. */
				page_addr = page_address(pmd_page(*pmd));
				if (!memchr_inv(page_addr, PAGE_INUSE,
						PMD_SIZE)) {
					free_hugepage_table(pmd_page(*pmd),
							    altmap);

					spin_lock(&init_mm.page_table_lock);
					pmd_clear(pmd);
					spin_unlock(&init_mm.page_table_lock);
				}
			}

			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next, direct);
		free_pte_table(pte_base, pmd);
	}

	/* Call free_pmd_table() in remove_pud_table(). */
	if (direct)
		update_page_count(PG_LEVEL_2M, -pages);
}

/*
 * Remove the PUD-level mappings that cover [addr, end).
 *
 * @pud_start: base of the PUD table; the entry for @addr is found with
 *             pud_index().
 * @altmap:    forwarded to remove_pmd_table().
 * @direct:    forwarded to remove_pmd_table(); when true the number of
 *             fully removed large mappings is subtracted from the
 *             PG_LEVEL_1G page count, and their backing pages are not freed
 *             here.
 *
 * Mirrors remove_pmd_table() one level up: non-present entries are skipped,
 * fully covered large PUDs are cleared, partially covered large PUDs are
 * marked with PAGE_INUSE and released only once the whole PUD_SIZE area
 * carries that marker, and everything else is delegated to
 * remove_pmd_table() followed by free_pmd_table().
 */
static void __meminit
remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
		 struct vmem_altmap *altmap, bool direct)
{
	unsigned long next, pages = 0;
	pmd_t *pmd_base;
	pud_t *pud;
	void *page_addr;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_large(*pud)) {
			if (IS_ALIGNED(addr, PUD_SIZE) &&
			    IS_ALIGNED(next, PUD_SIZE)) {
				if (!direct)
					free_pagetable(pud_page(*pud),
						       get_order(PUD_SIZE));

				spin_lock(&init_mm.page_table_lock);
				pud_clear(pud);
				spin_unlock(&init_mm.page_table_lock);
				pages++;
			} else {
				/* If here, we are freeing vmemmap pages. */
				memset((void *)addr, PAGE_INUSE, next - addr);

				page_addr = page_address(pud_page(*pud));
				if (!memchr_inv(page_addr, PAGE_INUSE,
						PUD_SIZE)) {
					free_pagetable(pud_page(*pud),
						       get_order(PUD_SIZE));

					spin_lock(&init_mm.page_table_lock);
					pud_clear(pud);
					spin_unlock(&init_mm.page_table_lock);
				}
			}

			continue;
		}

		pmd_base = pmd_offset(pud, 0);
		remove_pmd_table(pmd_base, addr, next, direct, altmap);
		free_pmd_table(pmd_base, pud);
	}

	if (direct)
		update_page_count(PG_LEVEL_1G, -pages);
}

1079 1080
static void __meminit
remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
1081
		 struct vmem_altmap *altmap, bool direct)
1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095
{
	unsigned long next, pages = 0;
	pud_t *pud_base;
	p4d_t *p4d;

	p4d = p4d_start + p4d_index(addr);
	for (; addr < end; addr = next, p4d++) {
		next = p4d_addr_end(addr, end);

		if (!p4d_present(*p4d))
			continue;

		BUILD_BUG_ON(p4d_large(*p4d));

1096
		pud_base = pud_offset(p4d, 0);
1097
		remove_pud_table(pud_base, addr, next, altmap, direct);
1098 1099 1100 1101 1102
		/*
		 * For 4-level page tables we do not want to free PUDs, but in the
		 * 5-level case we should free them. This code will have to change
		 * to adapt for boot-time switching between 4 and 5 level page tables.
		 */
1103
		if (pgtable_l5_enabled())
1104
			free_pud_table(pud_base, p4d);
1105 1106 1107 1108 1109 1110
	}

	if (direct)
		update_page_count(PG_LEVEL_512G, -pages);
}

1111 1112
/* start and end are both virtual address. */
static void __meminit
1113 1114
remove_pagetable(unsigned long start, unsigned long end, bool direct,
		struct vmem_altmap *altmap)
1115 1116
{
	unsigned long next;
1117
	unsigned long addr;
1118
	pgd_t *pgd;
1119
	p4d_t *p4d;
1120

1121 1122
	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);
1123

1124
		pgd = pgd_offset_k(addr);
1125 1126 1127
		if (!pgd_present(*pgd))
			continue;

1128
		p4d = p4d_offset(pgd, 0);
1129
		remove_p4d_table(p4d, addr, next, altmap, direct);
1130 1131 1132 1133 1134
	}

	flush_tlb_all();
}

1135 1136
void __ref vmemmap_free(unsigned long start, unsigned long end,
		struct vmem_altmap *altmap)
1137
{
1138
	remove_pagetable(start, end, false, altmap);
1139 1140
}

1141
#ifdef CONFIG_MEMORY_HOTREMOVE
1142 1143 1144 1145 1146 1147
static void __meminit
kernel_physical_mapping_remove(unsigned long start, unsigned long end)
{
	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

1148
	remove_pagetable(start, end, true, NULL);
1149 1150
}

1151
int __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
1152 1153 1154
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
1155
	struct page *page = pfn_to_page(start_pfn);
1156 1157 1158
	struct zone *zone;
	int ret;

1159 1160 1161 1162
	/* With altmap the first mapped page is offset from @start */
	if (altmap)
		page += vmem_altmap_offset(altmap);
	zone = page_zone(page);
1163
	ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
1164
	WARN_ON_ONCE(ret);
1165
	kernel_physical_mapping_remove(start, start + size);
1166 1167 1168 1169

	return ret;
}
#endif
1170 1171
#endif /* CONFIG_MEMORY_HOTPLUG */

1172
static struct kcore_list kcore_vsyscall;
L
Linus Torvalds 已提交
1173

Y
Yinghai Lu 已提交
1174 1175 1176 1177 1178 1179 1180 1181 1182 1183
/*
 * Call register_page_bootmem_info_node() for the node data of every online
 * node.  The body is empty when CONFIG_NUMA is not set.
 */
static void __init register_page_bootmem_info(void)
{
#ifdef CONFIG_NUMA
	int nid;

	for_each_online_node(nid) {
		register_page_bootmem_info_node(NODE_DATA(nid));
	}
#endif
}

L
Linus Torvalds 已提交
1184 1185
void __init mem_init(void)
{
1186
	pci_iommu_alloc();
L
Linus Torvalds 已提交
1187

1188
	/* clear_bss() already clear the empty_zero_page */
L
Linus Torvalds 已提交
1189

1190
	/* this will put all memory onto the freelists */
1191
	free_all_bootmem();
L
Linus Torvalds 已提交
1192
	after_bootmem = 1;
1193
	x86_init.hyper.init_after_bootmem();
L
Linus Torvalds 已提交
1194

1195 1196 1197 1198 1199 1200 1201 1202
	/*
	 * Must be done after boot memory is put on freelist, because here we
	 * might set fields in deferred struct pages that have not yet been
	 * initialized, and free_all_bootmem() initializes all the reserved
	 * deferred pages for us.
	 */
	register_page_bootmem_info();

L
Linus Torvalds 已提交
1203
	/* Register memory areas for /proc/kcore */
1204 1205
	if (get_gate_vma(&init_mm))
		kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
L
Linus Torvalds 已提交
1206

1207
	mem_init_print_info(NULL);
L
Linus Torvalds 已提交
1208 1209
}

/*
 * Set to 1 by mark_rodata_ro().  While it is zero, set_kernel_text_rw()
 * and set_kernel_text_ro() return without changing any permissions.
 */
int kernel_set_to_readonly;

void set_kernel_text_rw(void)
{
1214
	unsigned long start = PFN_ALIGN(_text);
1215
	unsigned long end = PFN_ALIGN(__stop___ex_table);
1216 1217 1218 1219 1220 1221 1222

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read write\n",
		 start, end);

1223 1224 1225 1226 1227
	/*
	 * Make the kernel identity mapping for text RW. Kernel text
	 * mapping will always be RO. Refer to the comment in
	 * static_protections() in pageattr.c
	 */
1228 1229 1230 1231 1232
	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}

void set_kernel_text_ro(void)
{
1233
	unsigned long start = PFN_ALIGN(_text);
1234
	unsigned long end = PFN_ALIGN(__stop___ex_table);
1235 1236 1237 1238 1239 1240 1241

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read only\n",
		 start, end);

1242 1243 1244
	/*
	 * Set the kernel identity mapping for text RO.
	 */
1245 1246 1247
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
}

1248 1249
void mark_rodata_ro(void)
{
1250
	unsigned long start = PFN_ALIGN(_text);
1251
	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
1252
	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
1253 1254
	unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
	unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
1255
	unsigned long all_end;
1256

1257
	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
1258
	       (end - start) >> 10);
1259 1260
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

1261 1262
	kernel_set_to_readonly = 1;

1263
	/*
1264 1265
	 * The rodata/data/bss/brk section (but not the kernel text!)
	 * should also be not-executable.
1266 1267 1268 1269 1270 1271 1272 1273
	 *
	 * We align all_end to PMD_SIZE because the existing mapping
	 * is a full PMD. If we would align _brk_end to PAGE_SIZE we
	 * split the PMD and the reminder between _brk_end and the end
	 * of the PMD will remain mapped executable.
	 *
	 * Any PMD which was setup after the one which covers _brk_end
	 * has been zapped already via cleanup_highmem().
1274
	 */
1275
	all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
1276
	set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);
1277

1278
#ifdef CONFIG_CPA_DEBUG
1279
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
1280
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);
1281

1282
	printk(KERN_INFO "Testing CPA: again\n");
1283
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
1284
#endif
1285

1286
	free_init_pages("unused kernel",
1287 1288
			(unsigned long) __va(__pa_symbol(text_end)),
			(unsigned long) __va(__pa_symbol(rodata_start)));
1289
	free_init_pages("unused kernel",
1290 1291
			(unsigned long) __va(__pa_symbol(rodata_end)),
			(unsigned long) __va(__pa_symbol(_sdata)));
S
Stephen Smalley 已提交
1292 1293

	debug_checkwx();
1294 1295 1296 1297 1298 1299

	/*
	 * Do this after all of the manipulation of the
	 * kernel text page tables are complete.
	 */
	pti_clone_kernel_text();
1300
}
1301

T
Thomas Gleixner 已提交
1302 1303
int kern_addr_valid(unsigned long addr)
{
L
Linus Torvalds 已提交
1304
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
T
Thomas Gleixner 已提交
1305
	pgd_t *pgd;
1306
	p4d_t *p4d;
T
Thomas Gleixner 已提交
1307 1308 1309
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
L
Linus Torvalds 已提交
1310 1311

	if (above != 0 && above != -1UL)
T
Thomas Gleixner 已提交
1312 1313
		return 0;

L
Linus Torvalds 已提交
1314 1315 1316 1317
	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

1318 1319 1320 1321 1322
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		return 0;

	pud = pud_offset(p4d, addr);
L
Linus Torvalds 已提交
1323
	if (pud_none(*pud))
T
Thomas Gleixner 已提交
1324
		return 0;
L
Linus Torvalds 已提交
1325

1326 1327 1328
	if (pud_large(*pud))
		return pfn_valid(pud_pfn(*pud));

L
Linus Torvalds 已提交
1329 1330 1331
	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;
T
Thomas Gleixner 已提交
1332

L
Linus Torvalds 已提交
1333 1334 1335 1336 1337 1338
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;
T
Thomas Gleixner 已提交
1339

L
Linus Torvalds 已提交
1340 1341 1342
	return pfn_valid(pte_pfn(*pte));
}

/*
 * Block size is the minimum amount of memory which can be hotplugged or
 * hotremoved. It must be a power of two and must be equal to or larger
 * than MIN_MEMORY_BLOCK_SIZE.
 */
#define MAX_BLOCK_SIZE (2UL << 30)

/* Amount of RAM needed to start using large blocks */
#define MEM_SIZE_FOR_LARGE_BLOCK (64UL << 30)

/* Adjustable memory block size; zero means "not set, probe it". */
static unsigned long set_memory_block_size;

/*
 * Request a memory block size of 2^@order bytes.
 *
 * Returns 0 on success, or -EINVAL when the resulting size is smaller than
 * MIN_MEMORY_BLOCK_SIZE, larger than MEM_SIZE_FOR_LARGE_BLOCK, or cannot be
 * represented in an unsigned long.
 */
int __init set_memory_block_size_order(unsigned int order)
{
	unsigned long size;

	/*
	 * Shifting by the width of the type or more is undefined, and could
	 * let a bogus order pass the range check below.
	 */
	if (order >= BITS_PER_LONG)
		return -EINVAL;

	size = 1UL << order;
	if (size > MEM_SIZE_FOR_LARGE_BLOCK || size < MIN_MEMORY_BLOCK_SIZE)
		return -EINVAL;

	set_memory_block_size = size;
	return 0;
}

1366
static unsigned long probe_memory_block_size(void)
1367
{
1368 1369
	unsigned long boot_mem_end = max_pfn << PAGE_SHIFT;
	unsigned long bz;
1370

1371 1372 1373
	/* If memory block size has been set, then use it */
	bz = set_memory_block_size;
	if (bz)
1374
		goto done;
1375

1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387
	/* Use regular block if RAM is smaller than MEM_SIZE_FOR_LARGE_BLOCK */
	if (boot_mem_end < MEM_SIZE_FOR_LARGE_BLOCK) {
		bz = MIN_MEMORY_BLOCK_SIZE;
		goto done;
	}

	/* Find the largest allowed block size that aligns to memory end */
	for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
		if (IS_ALIGNED(boot_mem_end, bz))
			break;
	}
done:
1388
	pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20);
1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401

	return bz;
}

/* Cached result of probe_memory_block_size(); zero until first use. */
static unsigned long memory_block_size_probed;

/*
 * Return the memory block size in bytes, probing it on the first call and
 * reusing the cached value afterwards.
 */
unsigned long memory_block_size_bytes(void)
{
	if (memory_block_size_probed)
		return memory_block_size_probed;

	memory_block_size_probed = probe_memory_block_size();
	return memory_block_size_probed;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */

/*
 * Bookkeeping for the run of contiguous PMD mappings currently being built:
 * the virtual range [addr_start, addr_end), the backing buffer range
 * [p_start, p_end) and the node it belongs to.  Used only for the debug
 * output of vmemmap_populate_hugepages()/vmemmap_populate_print_last().
 * The addresses are unsigned long to match the values stored in them and
 * the %lx format they are printed with.
 */
static unsigned long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

1410
static int __meminit vmemmap_populate_hugepages(unsigned long start,
1411
		unsigned long end, int node, struct vmem_altmap *altmap)
1412
{
1413
	unsigned long addr;
1414 1415
	unsigned long next;
	pgd_t *pgd;
1416
	p4d_t *p4d;
1417 1418 1419
	pud_t *pud;
	pmd_t *pmd;

1420
	for (addr = start; addr < end; addr = next) {
1421
		next = pmd_addr_end(addr, end);
1422 1423 1424 1425

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;
T
Thomas Gleixner 已提交
1426

1427 1428 1429 1430 1431
		p4d = vmemmap_p4d_populate(pgd, addr, node);
		if (!p4d)
			return -ENOMEM;

		pud = vmemmap_pud_populate(p4d, addr, node);
1432 1433 1434
		if (!pud)
			return -ENOMEM;

1435 1436 1437
		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			void *p;
T
Thomas Gleixner 已提交
1438

1439 1440 1441 1442
			if (altmap)
				p = altmap_alloc_block_buf(PMD_SIZE, altmap);
			else
				p = vmemmap_alloc_block_buf(PMD_SIZE, node);
1443 1444 1445 1446 1447 1448 1449 1450 1451 1452
			if (p) {
				pte_t entry;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
D
Dan Williams 已提交
1453
						pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1454 1455 1456 1457 1458
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}
1459

1460 1461 1462
				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
				continue;
1463 1464
			} else if (altmap)
				return -ENOMEM; /* no fallback */
1465
		} else if (pmd_large(*pmd)) {
1466
			vmemmap_verify((pte_t *)pmd, node, addr, next);
1467 1468 1469 1470
			continue;
		}
		if (vmemmap_populate_basepages(addr, next, node))
			return -ENOMEM;
1471 1472 1473
	}
	return 0;
}
1474

1475 1476
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
		struct vmem_altmap *altmap)
1477 1478 1479
{
	int err;

1480
	if (boot_cpu_has(X86_FEATURE_PSE))
1481 1482 1483 1484 1485 1486
		err = vmemmap_populate_hugepages(start, end, node, altmap);
	else if (altmap) {
		pr_err_once("%s: no cpu support for altmap allocations\n",
				__func__);
		err = -ENOMEM;
	} else
1487 1488
		err = vmemmap_populate_basepages(start, end, node);
	if (!err)
1489
		sync_global_pgds(start, end - 1);
1490 1491 1492
	return err;
}

#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
/*
 * Register, via get_page_bootmem(), the pages that back the memmap of
 * section @section_nr, i.e. the virtual range covering @nr_pages struct
 * pages starting at @start_page.
 *
 * The page-table pages at the PGD, P4D, PUD (and, without PSE, PMD) levels
 * are registered as MIX_SECTION_INFO; the pages holding the memmap itself
 * are registered as SECTION_INFO.  Without X86_FEATURE_PSE the walk
 * advances one page at a time; with it, one PMD at a time, registering
 * every page of the PMD-sized mapping.  Holes at any level are skipped.
 */
void register_page_bootmem_memmap(unsigned long section_nr,
				  struct page *start_page, unsigned long nr_pages)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + nr_pages);
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	unsigned int nr_pmd_pages;
	struct page *page;

	for (; addr < end; addr = next) {
		pte_t *pte = NULL;

		pgd = pgd_offset_k(addr);
		if (pgd_none(*pgd)) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			continue;
		}
		get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);

		p4d = p4d_offset(pgd, addr);
		if (p4d_none(*p4d)) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			continue;
		}
		get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO);

		pud = pud_offset(p4d, addr);
		if (pud_none(*pud)) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			continue;
		}
		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);

		if (!boot_cpu_has(X86_FEATURE_PSE)) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd))
				continue;
			get_page_bootmem(section_nr, pmd_page(*pmd),
					 MIX_SECTION_INFO);

			pte = pte_offset_kernel(pmd, addr);
			if (pte_none(*pte))
				continue;
			get_page_bootmem(section_nr, pte_page(*pte),
					 SECTION_INFO);
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd))
				continue;

			nr_pmd_pages = 1 << get_order(PMD_SIZE);
			page = pmd_page(*pmd);
			while (nr_pmd_pages--)
				get_page_bootmem(section_nr, page++,
						 SECTION_INFO);
		}
	}
}
#endif

1561 1562 1563
void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
D
Dan Williams 已提交
1564
		pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1565 1566 1567 1568 1569 1570
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
1571
#endif