init_64.c 22.2 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
24
#include <linux/pci.h>
25
#include <linux/pfn.h>
26
#include <linux/poison.h>
27
#include <linux/dma-mapping.h>
28 29
#include <linux/module.h>
#include <linux/memory_hotplug.h>
30
#include <linux/nmi.h>
L
Linus Torvalds 已提交
31 32 33 34 35 36 37 38 39 40 41 42 43 44

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
45
#include <asm/sections.h>
46
#include <asm/kdebug.h>
47
#include <asm/numa.h>
48
#include <asm/cacheflush.h>
L
Linus Torvalds 已提交
49

T
Thomas Gleixner 已提交
50
const struct dma_mapping_ops *dma_ops;
51 52
EXPORT_SYMBOL(dma_ops);

53 54
static unsigned long dma_reserve __initdata;

L
Linus Torvalds 已提交
55 56
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

I
Ingo Molnar 已提交
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
int direct_gbpages __meminitdata
#ifdef CONFIG_DIRECT_GBPAGES
				= 1
#endif
;

/* "nogbpages" boot option: force-disable 1GB direct mappings. */
static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

/* "gbpages" boot option: request 1GB direct mappings (honored if CPU supports them). */
static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

L
Linus Torvalds 已提交
77 78 79 80 81 82 83 84
/*
 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
 * physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

void show_mem(void)
{
85 86
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
L
Linus Torvalds 已提交
87
	struct page *page;
T
Thomas Gleixner 已提交
88
	pg_data_t *pgdat;
L
Linus Torvalds 已提交
89

90
	printk(KERN_INFO "Mem-info:\n");
L
Linus Torvalds 已提交
91
	show_free_areas();
92
	for_each_online_pgdat(pgdat) {
T
Thomas Gleixner 已提交
93 94 95 96 97 98
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			/*
			 * This loop can take a while with 256 GB and
			 * 4k pages so defer the NMI watchdog:
			 */
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
99
				touch_nmi_watchdog();
T
Thomas Gleixner 已提交
100

B
Bob Picco 已提交
101 102
			if (!pfn_valid(pgdat->node_start_pfn + i))
				continue;
T
Thomas Gleixner 已提交
103

L
Linus Torvalds 已提交
104 105
			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
106 107 108 109 110 111
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
T
Thomas Gleixner 已提交
112
		}
L
Linus Torvalds 已提交
113
	}
T
Thomas Gleixner 已提交
114 115 116 117
	printk(KERN_INFO "%lu pages of RAM\n",		total);
	printk(KERN_INFO "%lu reserved pages\n",	reserved);
	printk(KERN_INFO "%lu pages shared\n",		shared);
	printk(KERN_INFO "%lu pages swap cached\n",	cached);
L
Linus Torvalds 已提交
118 119 120 121
}

int after_bootmem;

122
static __init void *spp_getpage(void)
T
Thomas Gleixner 已提交
123
{
L
Linus Torvalds 已提交
124
	void *ptr;
T
Thomas Gleixner 已提交
125

L
Linus Torvalds 已提交
126
	if (after_bootmem)
T
Thomas Gleixner 已提交
127
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
L
Linus Torvalds 已提交
128 129
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);
T
Thomas Gleixner 已提交
130 131 132 133 134

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}
L
Linus Torvalds 已提交
135

136
	pr_debug("spp_getpage %p\n", ptr);
T
Thomas Gleixner 已提交
137

L
Linus Torvalds 已提交
138
	return ptr;
T
Thomas Gleixner 已提交
139
}
L
Linus Torvalds 已提交
140

T
Thomas Gleixner 已提交
141 142
/*
 * Map one kernel virtual address @vaddr to physical address @phys with
 * protection @prot, filling in missing pud/pmd levels from spp_getpage().
 * The pgd entry must already exist (set up in head.S); used by
 * __set_fixmap() at boot.
 */
static __init void
set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		if (pmd != pmd_offset(pud, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
				pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

191
/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (!pmd_present(*pmd))
			continue;
		/* clear every present pmd outside [_text, end] */
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

L
Linus Torvalds 已提交
219
/* NOTE: this is meant to be run only at boot */
T
Thomas Gleixner 已提交
220 221
void __init
__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
L
Linus Torvalds 已提交
222 223 224 225
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
226
		printk(KERN_ERR "Invalid __set_fixmap\n");
L
Linus Torvalds 已提交
227 228 229 230 231
		return;
	}
	set_pte_phys(address, phys, prot);
}

232 233
static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;
L
Linus Torvalds 已提交
234

235
static __meminit void *alloc_low_page(unsigned long *phys)
T
Thomas Gleixner 已提交
236
{
237
	unsigned long pfn = table_end++;
L
Linus Torvalds 已提交
238 239
	void *adr;

240 241 242
	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);
T
Thomas Gleixner 已提交
243

244 245 246
		return adr;
	}

T
Thomas Gleixner 已提交
247 248
	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");
249 250

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
251
	memset(adr, 0, PAGE_SIZE);
252 253 254
	*phys  = pfn * PAGE_SIZE;
	return adr;
}
L
Linus Torvalds 已提交
255

256
static __meminit void unmap_low_page(void *adr)
T
Thomas Gleixner 已提交
257
{
258 259 260
	if (after_bootmem)
		return;

261
	early_iounmap(adr, PAGE_SIZE);
T
Thomas Gleixner 已提交
262
}
L
Linus Torvalds 已提交
263

264
/* Must run before zap_low_mappings */
265
__meminit void *early_ioremap(unsigned long addr, unsigned long size)
266
{
267
	pmd_t *pmd, *last_pmd;
T
Thomas Gleixner 已提交
268
	unsigned long vaddr;
269 270 271 272 273 274
	int i, pmds;

	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	vaddr = __START_KERNEL_map;
	pmd = level2_kernel_pgt;
	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
T
Thomas Gleixner 已提交
275

276 277 278
	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
		for (i = 0; i < pmds; i++) {
			if (pmd_present(pmd[i]))
T
Thomas Gleixner 已提交
279
				goto continue_outer_loop;
280 281 282
		}
		vaddr += addr & ~PMD_MASK;
		addr &= PMD_MASK;
T
Thomas Gleixner 已提交
283

284
		for (i = 0; i < pmds; i++, addr += PMD_SIZE)
285
			set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
A
Andi Kleen 已提交
286
		__flush_tlb_all();
T
Thomas Gleixner 已提交
287

288
		return (void *)vaddr;
T
Thomas Gleixner 已提交
289
continue_outer_loop:
290
		;
291
	}
292
	printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size);
T
Thomas Gleixner 已提交
293

294
	return NULL;
295 296
}

T
Thomas Gleixner 已提交
297 298 299
/*
 * To avoid virtual aliases later:
 */
300
__meminit void early_iounmap(void *addr, unsigned long size)
301
{
302 303 304 305 306 307 308
	unsigned long vaddr;
	pmd_t *pmd;
	int i, pmds;

	vaddr = (unsigned long)addr;
	pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	pmd = level2_kernel_pgt + pmd_index(vaddr);
T
Thomas Gleixner 已提交
309

310 311
	for (i = 0; i < pmds; i++)
		pmd_clear(pmd + i);
T
Thomas Gleixner 已提交
312

A
Andi Kleen 已提交
313
	__flush_tlb_all();
314 315
}

316
static unsigned long __meminit
317
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
318
{
319
	int i = pmd_index(address);
320

321 322
	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		pmd_t *pmd = pmd_page + pmd_index(address);
323

324
		if (address >= end) {
T
Thomas Gleixner 已提交
325
			if (!after_bootmem) {
326 327
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
T
Thomas Gleixner 已提交
328
			}
329 330
			break;
		}
331 332 333 334

		if (pmd_val(*pmd))
			continue;

335 336
		set_pte((pte_t *)pmd,
			pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
337
	}
338
	return address;
339 340
}

341
static unsigned long __meminit
342 343
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
T
Thomas Gleixner 已提交
344
	pmd_t *pmd = pmd_offset(pud, 0);
345 346
	unsigned long last_map_addr;

347
	spin_lock(&init_mm.page_table_lock);
348
	last_map_addr = phys_pmd_init(pmd, address, end);
349 350
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
351
	return last_map_addr;
352 353
}

354
static unsigned long __meminit
T
Thomas Gleixner 已提交
355 356
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
357
	unsigned long last_map_addr = end;
358
	int i = pud_index(addr);
359

T
Thomas Gleixner 已提交
360
	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
361 362
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
L
Linus Torvalds 已提交
363 364
		pmd_t *pmd;

365
		if (addr >= end)
L
Linus Torvalds 已提交
366 367
			break;

T
Thomas Gleixner 已提交
368 369 370
		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
L
Linus Torvalds 已提交
371
			continue;
T
Thomas Gleixner 已提交
372
		}
L
Linus Torvalds 已提交
373

374
		if (pud_val(*pud)) {
375
			if (!pud_large(*pud))
376
				last_map_addr = phys_pmd_update(pud, addr, end);
377 378 379 380 381 382
			continue;
		}

		if (direct_gbpages) {
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
383
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
384 385 386
			continue;
		}

387
		pmd = alloc_low_page(&pmd_phys);
T
Thomas Gleixner 已提交
388

389
		spin_lock(&init_mm.page_table_lock);
L
Linus Torvalds 已提交
390
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
391
		last_map_addr = phys_pmd_init(pmd, addr, end);
392
		spin_unlock(&init_mm.page_table_lock);
T
Thomas Gleixner 已提交
393

394
		unmap_low_page(pmd);
L
Linus Torvalds 已提交
395
	}
A
Andi Kleen 已提交
396
	__flush_tlb_all();
397 398

	return last_map_addr >> PAGE_SHIFT;
T
Thomas Gleixner 已提交
399
}
L
Linus Torvalds 已提交
400 401 402

/*
 * Reserve an e820 region big enough to hold the direct-mapping page
 * tables for memory up to @end. Sets table_start/table_end (in pfns);
 * panics if no suitable region exists.
 */
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
	if (!direct_gbpages) {
		/* pmd tables are only needed without 1GB pages */
		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
		tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
	}

	/*
	 * RED-PEN putting page tables only on node 0 could
	 * cause a hotspot and fill up ZONE_DMA. The page tables
	 * need roughly 0.5KB per GB.
	 */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT,
		(table_start << PAGE_SHIFT) + tables);
}

430 431 432 433 434 435 436 437
/* Keep direct_gbpages only if the CPU actually supports 1GB pages. */
static void __init init_gbpages(void)
{
	if (direct_gbpages && cpu_has_gbpages)
		printk(KERN_INFO "Using GB pages for direct mapping\n");
	else
		direct_gbpages = 0;
}

Y
Yinghai Lu 已提交
438 439 440 441
#ifdef CONFIG_MEMTEST_BOOTPARAM

/*
 * Test physical RAM [start_phys, start_phys+size) by writing one of
 * four bit patterns and reading it back word by word. Contiguous bad
 * words are coalesced and the range is reserved as "BAD RAM".
 */
static void __init memtest(unsigned long start_phys, unsigned long size,
				 unsigned pattern)
{
	unsigned long i;
	unsigned long *start;
	unsigned long start_bad;
	unsigned long last_bad;
	unsigned long val;
	unsigned long start_phys_aligned;
	unsigned long count;
	unsigned long incr;

	switch (pattern) {
	case 0:
		val = 0UL;
		break;
	case 1:
		val = -1UL;
		break;
	case 2:
		val = 0x5555555555555555UL;
		break;
	case 3:
		val = 0xaaaaaaaaaaaaaaaaUL;
		break;
	default:
		return;
	}

	incr = sizeof(unsigned long);
	start_phys_aligned = ALIGN(start_phys, incr);
	count = (size - (start_phys_aligned - start_phys))/incr;
	start = __va(start_phys_aligned);
	start_bad = 0;
	last_bad = 0;

	for (i = 0; i < count; i++)
		start[i] = val;
	for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
		if (*start != val) {
			if (start_phys_aligned == last_bad + incr) {
				last_bad += incr;
			} else {
				if (start_bad) {
					/*
					 * NOTE(review): 2nd arg to reserve_early() is
					 * 'last_bad - start_bad' here but the printk
					 * reports up to 'last_bad + incr' - verify the
					 * expected (start, end) semantics of
					 * reserve_early() against its definition.
					 */
					printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
						val, start_bad, last_bad + incr);
					reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
				}
				start_bad = last_bad = start_phys_aligned;
			}
		}
	}
	/* flush the final pending bad range, if any */
	if (start_bad) {
		printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
			val, start_bad, last_bad + incr);
		reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
	}

}

Y
Yinghai Lu 已提交
500 501
static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;

Y
Yinghai Lu 已提交
502 503 504
/* "memtest=N" boot option: run N memtest pattern passes at boot. */
static int __init parse_memtest(char *arg)
{
	if (arg)
		memtest_pattern = simple_strtoul(arg, NULL, 0);
	return 0;
}

early_param("memtest", parse_memtest);

/*
 * Run memtest_pattern passes of memtest() over every free e820 chunk
 * within [start, end). Does nothing when memtest_pattern is 0.
 */
static void __init early_memtest(unsigned long start, unsigned long end)
{
	unsigned long t_start, t_size;
	unsigned pattern;

	if (!memtest_pattern)
		return;

	printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
	for (pattern = 0; pattern < memtest_pattern; pattern++) {
		t_start = start;
		t_size = 0;
		while (t_start < end) {
			t_start = find_e820_area_size(t_start, &t_size, 1);

			/* done ? */
			if (t_start >= end)
				break;
			if (t_start + t_size > end)
				t_size = end - t_start;

			printk(KERN_CONT "\n  %016lx - %016lx pattern %d",
				t_start, t_start + t_size, pattern);

			memtest(t_start, t_size, pattern);

			t_start += t_size;
		}
	}
	printk(KERN_CONT "\n");
}
Y
Yinghai Lu 已提交
542 543 544 545 546
#else
static void __init early_memtest(unsigned long start, unsigned long end)
{
}
#endif
Y
Yinghai Lu 已提交
547

T
Thomas Gleixner 已提交
548 549 550 551 552
/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
553
unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
T
Thomas Gleixner 已提交
554
{
555
	unsigned long next, last_map_addr = end;
Y
Yinghai Lu 已提交
556
	unsigned long start_phys = start, end_phys = end;
L
Linus Torvalds 已提交
557

Y
Yinghai Lu 已提交
558
	printk(KERN_INFO "init_memory_mapping\n");
L
Linus Torvalds 已提交
559

T
Thomas Gleixner 已提交
560
	/*
L
Linus Torvalds 已提交
561
	 * Find space for the kernel direct mapping tables.
T
Thomas Gleixner 已提交
562 563 564 565
	 *
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
L
Linus Torvalds 已提交
566
	 */
567 568
	if (!after_bootmem) {
		init_gbpages();
569
		find_early_table_space(end);
570
	}
L
Linus Torvalds 已提交
571 572 573 574 575

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
576
		pgd_t *pgd = pgd_offset_k(start);
T
Thomas Gleixner 已提交
577
		unsigned long pud_phys;
578 579 580
		pud_t *pud;

		if (after_bootmem)
581
			pud = pud_offset(pgd, start & PGDIR_MASK);
582
		else
583
			pud = alloc_low_page(&pud_phys);
584

L
Linus Torvalds 已提交
585
		next = start + PGDIR_SIZE;
T
Thomas Gleixner 已提交
586 587
		if (next > end)
			next = end;
588
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
589 590
		if (!after_bootmem)
			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
591
		unmap_low_page(pud);
T
Thomas Gleixner 已提交
592
	}
L
Linus Torvalds 已提交
593

594
	if (!after_bootmem)
595
		mmu_cr4_features = read_cr4();
L
Linus Torvalds 已提交
596
	__flush_tlb_all();
597

598 599 600
	if (!after_bootmem)
		reserve_early(table_start << PAGE_SHIFT,
				 table_end << PAGE_SHIFT, "PGTABLE");
Y
Yinghai Lu 已提交
601 602 603

	if (!after_bootmem)
		early_memtest(start_phys, end_phys);
604 605

	return last_map_addr;
L
Linus Torvalds 已提交
606 607
}

608
#ifndef CONFIG_NUMA
L
Linus Torvalds 已提交
609 610
/*
 * Non-NUMA zone setup: size ZONE_DMA/ZONE_DMA32/ZONE_NORMAL, initialize
 * sparsemem, and hand the zone limits to the core allocator.
 */
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = end_pfn;

	memory_present(0, 0, end_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

624 625 626 627 628 629
/*
 * Memory hotplug specific functions
 */
/* Hand one newly-onlined page to the buddy allocator and update counters. */
void online_page(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	totalram_pages++;
	num_physpages++;
}

636
#ifdef CONFIG_MEMORY_HOTPLUG
637 638 639 640
/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
641
int arch_add_memory(int nid, u64 start, u64 size)
642
{
643
	struct pglist_data *pgdat = NODE_DATA(nid);
644
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
645
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
646 647 648
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

649 650 651
	last_mapped_pfn = init_memory_mapping(start, start + size-1);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;
652

653
	ret = __add_pages(zone, start_pfn, nr_pages);
654
	WARN_ON(1);
655 656 657

	return ret;
}
658
EXPORT_SYMBOL_GPL(arch_add_memory);
659

660
#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
661 662 663 664
/* Without ACPI NUMA info, all hot-added memory maps to node 0. */
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
666 667
#endif

668 669
#endif /* CONFIG_MEMORY_HOTPLUG */

T
Thomas Gleixner 已提交
670 671
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
			 kcore_modules, kcore_vsyscall;
L
Linus Torvalds 已提交
672 673 674

/*
 * Late boot memory init: release bootmem pages to the buddy allocator,
 * register /proc/kcore areas, and print the memory summary banner.
 */
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear_bss() already clear the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = end_pfn - totalram_pages -
					absent_pages_in_range(0, end_pfn);
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
				"%ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);

	cpa_init();
}

718
void free_init_pages(char *what, unsigned long begin, unsigned long end)
L
Linus Torvalds 已提交
719
{
720
	unsigned long addr = begin;
L
Linus Torvalds 已提交
721

722
	if (addr >= end)
723 724
		return;

I
Ingo Molnar 已提交
725 726 727 728 729 730 731 732 733 734
	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
		begin, PAGE_ALIGN(end));
	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
735
	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
T
Thomas Gleixner 已提交
736

737
	for (; addr < end; addr += PAGE_SIZE) {
738 739 740 741 742
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
L
Linus Torvalds 已提交
743 744
		totalram_pages++;
	}
I
Ingo Molnar 已提交
745
#endif
746 747 748 749 750
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
751 752
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
L
Linus Torvalds 已提交
753 754
}

755
#ifdef CONFIG_DEBUG_RODATA
756 757
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
758 759 760

void mark_rodata_ro(void)
{
761
	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
762

763
	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
764
	       (end - start) >> 10);
765 766 767 768 769 770 771 772
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	set_memory_nx(start, (end - start) >> PAGE_SHIFT);
773

774 775
	rodata_test();

776
#ifdef CONFIG_CPA_DEBUG
777
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
778
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);
779

780
	printk(KERN_INFO "Testing CPA: again\n");
781
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
782
#endif
783
}
784

785 786
#endif

L
Linus Torvalds 已提交
787 788 789
#ifdef CONFIG_BLK_DEV_INITRD
/* Release the initrd image pages once it is no longer needed. */
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

T
Thomas Gleixner 已提交
794 795
/*
 * Reserve [phys, phys+len) in bootmem (node-aware under NUMA). Ranges
 * past end_pfn are tolerated for kdump firmware tables; reservations in
 * the DMA zone are also accounted in dma_reserve.
 */
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
	int nid = phys_to_nid(phys);
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= end_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
				phys, len);
		return;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
#else
	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif
	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}
}

T
Thomas Gleixner 已提交
826 827
int kern_addr_valid(unsigned long addr)
{
L
Linus Torvalds 已提交
828
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
T
Thomas Gleixner 已提交
829 830 831 832
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
L
Linus Torvalds 已提交
833 834

	if (above != 0 && above != -1UL)
T
Thomas Gleixner 已提交
835 836
		return 0;

L
Linus Torvalds 已提交
837 838 839 840 841 842
	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
T
Thomas Gleixner 已提交
843
		return 0;
L
Linus Torvalds 已提交
844 845 846 847

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;
T
Thomas Gleixner 已提交
848

L
Linus Torvalds 已提交
849 850 851 852 853 854
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;
T
Thomas Gleixner 已提交
855

L
Linus Torvalds 已提交
856 857 858
	return pfn_valid(pte_pfn(*pte));
}

T
Thomas Gleixner 已提交
859 860 861 862 863
/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
L
Linus Torvalds 已提交
864
static struct vm_area_struct gate_vma = {
T
Thomas Gleixner 已提交
865 866 867 868
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
L
Linus Torvalds 已提交
869 870 871 872 873
};

/*
 * Return the vsyscall gate VMA for @tsk, or NULL for 32bit-emulated
 * tasks (which have no 64bit vsyscall page).
 */
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);
T
Thomas Gleixner 已提交
883

884 885
	if (!vma)
		return 0;
T
Thomas Gleixner 已提交
886

L
Linus Torvalds 已提交
887 888 889
	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

T
Thomas Gleixner 已提交
890 891 892 893
/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
L
Linus Torvalds 已提交
894 895 896
 */
int in_gate_area_no_task(unsigned long addr)
{
897
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
L
Linus Torvalds 已提交
898
}
899

900 901 902 903 904 905 906 907
/* Name special VMAs ("[vdso]", "[vsyscall]") for /proc/<pid>/maps. */
const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}
908 909 910 911 912

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
T
Thomas Gleixner 已提交
913 914
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
915 916 917 918 919 920 921 922 923 924 925 926 927 928
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;
T
Thomas Gleixner 已提交
929

930 931 932 933 934 935 936
		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			pte_t entry;
T
Thomas Gleixner 已提交
937 938 939
			void *p;

			p = vmemmap_alloc_block(PMD_SIZE, node);
940 941 942
			if (!p)
				return -ENOMEM;

T
Thomas Gleixner 已提交
943 944
			entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
							PAGE_KERNEL_LARGE);
945 946 947 948
			set_pmd(pmd, __pmd(pte_val(entry)));

			printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
				addr, addr + PMD_SIZE - 1, p, node);
T
Thomas Gleixner 已提交
949
		} else {
950
			vmemmap_verify((pte_t *)pmd, node, addr, next);
T
Thomas Gleixner 已提交
951
		}
952 953 954 955
	}
	return 0;
}
#endif