/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>
#include <linux/gfp.h>

#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>
#include <asm/uv/uv.h>
#include <asm/setup.h>

#include "mm_internal.h"

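/*
 * "gbpages"/"nogbpages" on the kernel command line force the use of 1GB
 * pages for the direct mapping on or off.
 */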
static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);

int force_personality32;

/*
 * noexec32=on|off
 * Control non executable heap for 32bit processes.
 * To control the stack too use noexec=off
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	else if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	return 1;
}
__setup("noexec32=", nonx32_setup);

/*
 * When memory is added or removed, make sure all process MMs have
 * suitable PGD entries in the local PGD-level page.
 */
void sync_global_pgds(unsigned long start, unsigned long end)
{
	unsigned long address;

	for (address = start; address <= end; address += PGDIR_SIZE) {
		const pgd_t *pgd_ref = pgd_offset_k(address);
		struct page *page;

		if (pgd_none(*pgd_ref))
			continue;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			spinlock_t *pgt_lock;

			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);
			else
				BUG_ON(pgd_page_vaddr(*pgd)
				       != pgd_page_vaddr(*pgd_ref));

			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}

/*
 * NOTE: This function is marked __ref because it calls __init function
 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

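/*
 * The fill_*() helpers below allocate a missing intermediate page-table
 * page via spp_getpage(), hook it into init_mm, and return the entry
 * covering @vaddr at the next level down.
 */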
static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
{
	if (pgd_none(*pgd)) {
		pud_t *pud = (pud_t *)spp_getpage();
		pgd_populate(&init_mm, pgd, pud);
		if (pud != pud_offset(pgd, 0))
			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
			       pud, pud_offset(pgd, 0));
	}
	return pud_offset(pgd, vaddr);
}

static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
	if (pud_none(*pud)) {
		pmd_t *pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0))
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
			       pmd, pmd_offset(pud, 0));
	}
	return pmd_offset(pud, vaddr);
}

static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0))
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
	}
	return pte_offset_kernel(pmd, vaddr);
}

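/*
 * Install a single kernel PTE for @vaddr below the given PUD page,
 * allocating intermediate levels as needed, then flush that mapping.
 */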
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	pmd = fill_pmd(pud, vaddr);
	pte = fill_pte(pmd, vaddr);

	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}

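/*
 * populate_extra_pmd()/populate_extra_pte() return the kernel page-table
 * entry covering @vaddr at the PMD/PTE level, allocating any missing
 * intermediate levels on the way.
 */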
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = pgd_offset_k(vaddr);
	pud = fill_pud(pgd, vaddr);
	return fill_pmd(pud, vaddr);
}

pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd;

	pmd = populate_extra_pmd(vaddr);
	return fill_pte(pmd, vaddr);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			pud = (pud_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(pgd, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _brk_end.  _brk_end
 * is rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;

	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

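/*
 * Create 4k mappings with the given protection for the physical range
 * [addr, end) within one PTE page and return the last address mapped.
 */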
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
	      pgprot_t prot)
{
	unsigned long pages = 0, next;
	unsigned long last_map_addr = end;
	int i;

	pte_t *pte = pte_page + pte_index(addr);

	for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
		next = (addr & PAGE_MASK) + PAGE_SIZE;
		if (addr >= end) {
			if (!after_bootmem &&
			    !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
			    !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
				set_pte(pte, __pte(0));
			continue;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume someone who pre-setup
		 * these mappings are more intelligent.
		 */
		if (pte_val(*pte)) {
			if (!after_bootmem)
				pages++;
			continue;
		}

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return last_map_addr;
}

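/*
 * Create PMD-level mappings for the physical range [address, end): use 2M
 * pages when page_size_mask allows it, otherwise fall back to
 * phys_pte_init(). Returns the last address mapped.
 */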
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
	      unsigned long page_size_mask, pgprot_t prot)
{
	unsigned long pages = 0, next;
	unsigned long last_map_addr = end;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address = next) {
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;
		pgprot_t new_prot = prot;

		next = (address & PMD_MASK) + PMD_SIZE;
		if (address >= end) {
			if (!after_bootmem &&
			    !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
			    !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
				set_pmd(pmd, __pmd(0));
			continue;
		}

		if (pmd_val(*pmd)) {
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				pte = (pte_t *)pmd_page_vaddr(*pmd);
				last_map_addr = phys_pte_init(pte, address,
								end, prot);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping,
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				if (!after_bootmem)
					pages++;
				last_map_addr = next;
				continue;
			}
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = next;
			continue;
		}

		pte = alloc_low_page();
		last_map_addr = phys_pte_init(pte, address, end, new_prot);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, pte);
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return last_map_addr;
}

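/*
 * Create PUD-level mappings for the physical range [addr, end): use 1G
 * pages when page_size_mask allows it, otherwise descend into
 * phys_pmd_init(). Returns the last address mapped.
 */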
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0, next;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = next) {
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;
		pgprot_t prot = PAGE_KERNEL;

		next = (addr & PUD_MASK) + PUD_SIZE;
		if (addr >= end) {
			if (!after_bootmem &&
			    !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
			    !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN))
				set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud)) {
				pmd = pmd_offset(pud, 0);
				last_map_addr = phys_pmd_init(pmd, addr, end,
							 page_size_mask, prot);
				__flush_tlb_all();
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection  bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				if (!after_bootmem)
					pages++;
				last_map_addr = next;
				continue;
			}
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
					PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = next;
			continue;
		}

		pmd = alloc_low_page();
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
					      prot);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, pmd);
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}

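/*
 * Create the kernel direct mapping for the physical range [start, end),
 * using the largest page sizes permitted by page_size_mask, and return
 * the last physical address mapped.
 */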
unsigned long __meminit
kernel_physical_mapping_init(unsigned long start,
			     unsigned long end,
			     unsigned long page_size_mask)
{
	bool pgd_changed = false;
	unsigned long next, last_map_addr = end;
	unsigned long addr;

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);
	addr = start;

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		pud_t *pud;

		next = (start + PGDIR_SIZE) & PGDIR_MASK;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			pud = (pud_t *)pgd_page_vaddr(*pgd);
			last_map_addr = phys_pud_init(pud, __pa(start),
						 __pa(end), page_size_mask);
			continue;
		}

		pud = alloc_low_page();
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);

		spin_lock(&init_mm.page_table_lock);
		pgd_populate(&init_mm, pgd, pud);
		spin_unlock(&init_mm.page_table_lock);
		pgd_changed = true;
	}

	if (pgd_changed)
		sync_global_pgds(addr, end);

	__flush_tlb_all();

	return last_map_addr;
}

#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
}
#endif

void __init paging_init(void)
{
	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	/*
	 * clear the default setting with node 0
	 * note: don't use nodes_clear here, that is really clearing when
	 *	 numa support is not compiled in, and later node_set_state
	 *	 will not set it back.
	 */
	node_clear_state(0, N_MEMORY);
	if (N_MEMORY != N_NORMAL_MEMORY)
		node_clear_state(0, N_NORMAL_MEMORY);

	zone_sizes_init();
}

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
 * updating.
 */
static void  update_end_of_memory_vars(u64 start, u64 size)
{
	unsigned long end_pfn = PFN_UP(start + size);

	if (end_pfn > max_pfn) {
		max_pfn = end_pfn;
		max_low_pfn = end_pfn;
		high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
	}
}

/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	init_memory_mapping(start, start + size);

	ret = __add_pages(nid, zone, start_pfn, nr_pages);
	WARN_ON_ONCE(ret);

	/* update max_pfn, max_low_pfn and high_memory */
	update_end_of_memory_vars(start, size);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#endif /* CONFIG_MEMORY_HOTPLUG */

static struct kcore_list kcore_vsyscall;

static void __init register_page_bootmem_info(void)
{
#ifdef CONFIG_NUMA
	int i;

	for_each_online_node(i)
		register_page_bootmem_info_node(NODE_DATA(i));
#endif
}

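/*
 * Hand all bootmem-managed memory over to the buddy allocator, register
 * the vsyscall area with /proc/kcore and print the memory summary.
 */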
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;
	unsigned long absent_pages;

	pci_iommu_alloc();

	/* clear_bss() already clear the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
	register_page_bootmem_info();
	totalram_pages = free_all_bootmem();

	absent_pages = absent_pages_in_range(0, max_pfn);
	reservedpages = max_pfn - totalram_pages - absent_pages;
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
			 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
			 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		absent_pages << (PAGE_SHIFT-10),
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

int kernel_set_to_readonly;

void set_kernel_text_rw(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__stop___ex_table);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read write\n",
		 start, end);

	/*
	 * Make the kernel identity mapping for text RW. Kernel text
	 * mapping will always be RO. Refer to the comment in
	 * static_protections() in pageattr.c
	 */
	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}

void set_kernel_text_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__stop___ex_table);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read only\n",
		 start, end);

	/*
	 * Set the kernel identity mapping for text RO.
	 */
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
}

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long rodata_start =
		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
	unsigned long data_start = (unsigned long) &_sdata;

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	kernel_set_to_readonly = 1;

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif

	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(text_end)),
			(unsigned long)
				 page_address(virt_to_page(rodata_start)));
	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(rodata_end)),
			(unsigned long) page_address(virt_to_page(data_start)));
}

#endif

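/*
 * Walk the kernel page tables and return non-zero only if @addr is
 * canonical and backed by a present mapping with a valid pfn.
 */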
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_IA32_EMULATION
	if (!mm || mm->context.ia32_compat)
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(mm);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable mm, typically from interrupt
 * context. It is less reliable than using a task's mm and may give
 * false positives.
 */
int in_gate_area_no_mm(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

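/*
 * On UV systems memory hotplug blocks are 2GB; otherwise fall back to
 * the generic MIN_MEMORY_BLOCK_SIZE.
 */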
#ifdef CONFIG_X86_UV
unsigned long memory_block_size_bytes(void)
{
	if (is_uv_system()) {
		printk(KERN_INFO "UV: memory block size 2GB\n");
		return 2UL * 1024 * 1024 * 1024;
	}
	return MIN_MEMORY_BLOCK_SIZE;
}
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block_buf(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	sync_global_pgds((unsigned long)start_page, end);
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif