/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

int direct_gbpages __meminitdata
#ifdef CONFIG_DIRECT_GBPAGES
				= 1
#endif
;

static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the location of the first one and
 * move around without checking the pgd every time.
 */

void show_mem(void)
{
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
	struct page *page;
	pg_data_t *pgdat;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	for_each_online_pgdat(pgdat) {
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			/*
			 * This loop can take a while with 256 GB and
			 * 4k pages so defer the NMI watchdog:
			 */
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
				touch_nmi_watchdog();

			if (!pfn_valid(pgdat->node_start_pfn + i))
				continue;

			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
	}
	printk(KERN_INFO "%lu pages of RAM\n",		total);
	printk(KERN_INFO "%lu reserved pages\n",	reserved);
	printk(KERN_INFO "%lu pages shared\n",		shared);
	printk(KERN_INFO "%lu pages swap cached\n",	cached);
}

int after_bootmem;

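/*
 * Allocate one zeroed page for building the page tables set up by
 * set_pte_phys(): from the bootmem allocator before mem_init(), from
 * the page allocator afterwards.
 */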
static __init void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

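/*
 * Install a single kernel mapping of @phys at the virtual address
 * @vaddr, allocating intermediate page table levels from spp_getpage()
 * as needed. The pgd entry itself must already exist (head.S sets it
 * up for the fixmap).
 */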
static void
set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		if (pmd != pmd_offset(pud, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
				pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) && pte_val(new_pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (!pmd_present(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

/* NOTE: this is meant to be run only at boot */
void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		printk(KERN_ERR "Invalid __set_fixmap\n");
		return;
	}
	set_pte_phys(address, phys, prot);
}

static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;

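/*
 * Allocate a zeroed page for early page tables. Before bootmem is
 * available, pages come from the range reserved by
 * find_early_table_space() and are temporarily mapped with
 * early_ioremap().
 */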
static __meminit void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

static __meminit void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

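/*
 * Temporarily map a physical range by borrowing unused 2MB entries in
 * level2_kernel_pgt. The mapping must be undone with early_iounmap().
 */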
/* Must run before zap_low_mappings */
__meminit void *early_ioremap(unsigned long addr, unsigned long size)
{
	pmd_t *pmd, *last_pmd;
	unsigned long vaddr;
	int i, pmds;

	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	vaddr = __START_KERNEL_map;
	pmd = level2_kernel_pgt;
	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;

	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
		for (i = 0; i < pmds; i++) {
			if (pmd_present(pmd[i]))
				goto continue_outer_loop;
		}
		vaddr += addr & ~PMD_MASK;
		addr &= PMD_MASK;

		for (i = 0; i < pmds; i++, addr += PMD_SIZE)
			set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
		__flush_tlb_all();

		return (void *)vaddr;
continue_outer_loop:
		;
	}
	printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size);

	return NULL;
}

/*
 * To avoid virtual aliases later:
 */
__meminit void early_iounmap(void *addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd;
	int i, pmds;

	vaddr = (unsigned long)addr;
	pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	pmd = level2_kernel_pgt + pmd_index(vaddr);

	for (i = 0; i < pmds; i++)
		pmd_clear(pmd + i);

	__flush_tlb_all();
}

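/*
 * Fill one pmd page with 2MB mappings, starting at @address and
 * stopping at @end. Returns how far the mapping got.
 */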
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		pmd_t *pmd = pmd_page + pmd_index(address);

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd))
			continue;

		set_pte((pte_t *)pmd,
			pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
	}
	return address;
}

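/*
 * Populate the pmd page of an already-present pud entry, serialized by
 * init_mm.page_table_lock.
 */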
static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long last_map_addr;

	spin_lock(&init_mm.page_table_lock);
	last_map_addr = phys_pmd_init(pmd, address, end);
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
	return last_map_addr;
}

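/*
 * Fill one pud page, either with 1GB pages directly (when
 * direct_gbpages is enabled) or with freshly allocated pmd pages.
 * Returns the last mapped address as a pfn.
 */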
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud))
				last_map_addr = phys_pmd_update(pud, addr, end);
			continue;
		}

		if (direct_gbpages) {
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);

		spin_lock(&init_mm.page_table_lock);
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
		last_map_addr = phys_pmd_init(pmd, addr, end);
		spin_unlock(&init_mm.page_table_lock);

		unmap_low_page(pmd);
	}
	__flush_tlb_all();

	return last_map_addr >> PAGE_SHIFT;
}

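/*
 * Estimate the worst-case number of pud/pmd pages needed to map memory
 * up to @end and reserve a physical range for them from the e820 map.
 */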
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
	if (!direct_gbpages) {
		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
		tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
	}

	/*
	 * RED-PEN putting page tables only on node 0 could
	 * cause a hotspot and fill up ZONE_DMA. The page tables
	 * need roughly 0.5KB per GB.
	 */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT,
		(table_start << PAGE_SHIFT) + tables);
}

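/*
 * Use 1GB pages for the direct mapping only if the CPU supports them
 * (and "nogbpages" was not given on the command line).
 */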
static void __init init_gbpages(void)
{
	if (direct_gbpages && cpu_has_gbpages)
		printk(KERN_INFO "Using GB pages for direct mapping\n");
	else
		direct_gbpages = 0;
}

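/*
 * Simple boot-time memory test: for each pattern, fill the free e820
 * RAM ranges and read them back, reserving any mismatching region as
 * "BAD RAM". The number of patterns defaults to
 * CONFIG_MEMTEST_BOOTPARAM_VALUE and can be overridden with the
 * "memtest=" kernel parameter.
 */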
#ifdef CONFIG_MEMTEST_BOOTPARAM

static void __init memtest(unsigned long start_phys, unsigned long size,
				 unsigned pattern)
{
	unsigned long i;
	unsigned long *start;
	unsigned long start_bad;
	unsigned long last_bad;
	unsigned long val;
	unsigned long start_phys_aligned;
	unsigned long count;
	unsigned long incr;

	switch (pattern) {
	case 0:
		val = 0UL;
		break;
	case 1:
		val = -1UL;
		break;
	case 2:
		val = 0x5555555555555555UL;
		break;
	case 3:
		val = 0xaaaaaaaaaaaaaaaaUL;
		break;
	default:
		return;
	}

	incr = sizeof(unsigned long);
	start_phys_aligned = ALIGN(start_phys, incr);
	count = (size - (start_phys_aligned - start_phys))/incr;
	start = __va(start_phys_aligned);
	start_bad = 0;
	last_bad = 0;

	for (i = 0; i < count; i++)
		start[i] = val;
	for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
		if (*start != val) {
			if (start_phys_aligned == last_bad + incr) {
				last_bad += incr;
			} else {
				if (start_bad) {
					printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
						val, start_bad, last_bad + incr);
					reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
				}
				start_bad = last_bad = start_phys_aligned;
			}
		}
	}
	if (start_bad) {
		printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
			val, start_bad, last_bad + incr);
		reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
	}

}

static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;

static int __init parse_memtest(char *arg)
{
	if (arg)
		memtest_pattern = simple_strtoul(arg, NULL, 0);
	return 0;
}

early_param("memtest", parse_memtest);

static void __init early_memtest(unsigned long start, unsigned long end)
{
	unsigned long t_start, t_size;
	unsigned pattern;

	if (!memtest_pattern)
		return;

	printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
	for (pattern = 0; pattern < memtest_pattern; pattern++) {
		t_start = start;
		t_size = 0;
		while (t_start < end) {
			t_start = find_e820_area_size(t_start, &t_size, 1);

			/* done ? */
			if (t_start >= end)
				break;
			if (t_start + t_size > end)
				t_size = end - t_start;

			printk(KERN_CONT "\n  %016lx - %016lx pattern %d",
				t_start, t_start + t_size, pattern);

			memtest(t_start, t_size, pattern);

			t_start += t_size;
		}
	}
	printk(KERN_CONT "\n");
}
#else
static void __init early_memtest(unsigned long start, unsigned long end)
{
}
#endif

/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next, last_map_addr = end;
	unsigned long start_phys = start, end_phys = end;

	printk(KERN_INFO "init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 *
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem) {
		init_gbpages();
		find_early_table_space(end);
	}

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		if (after_bootmem)
			pud = pud_offset(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
		if (!after_bootmem)
			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
		unmap_low_page(pud);
	}

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();

	if (!after_bootmem)
		reserve_early(table_start << PAGE_SHIFT,
				 table_end << PAGE_SHIFT, "PGTABLE");

	if (!after_bootmem)
		early_memtest(start_phys, end_phys);

	return last_map_addr;
}

#ifndef CONFIG_NUMA
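/*
 * Set up the zone sizes (DMA, DMA32, NORMAL) and the sparsemem data
 * for the non-NUMA case.
 */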
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = end_pfn;

	memory_present(0, 0, end_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

/*
 * Memory hotplug specific functions
 */
void online_page(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	totalram_pages++;
	num_physpages++;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is always added to the NORMAL zone. This means you will never
 * get additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size-1);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(zone, start_pfn, nr_pages);
	WARN_ON(1);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 *
 * On x86, access has to be given to the first megabyte of RAM because that area
 * contains BIOS code and data regions used by X and dosemu and similar apps.
 * Access has to be given to non-kernel-RAM areas as well; these contain the PCI
 * MMIO resources as well as potential BIOS/ACPI data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)
		return 1;
	if (!page_is_ram(pagenr))
		return 1;
	return 0;
}


static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
			 kcore_modules, kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear_bss() already cleared the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = end_pfn - totalram_pages -
					absent_pages_in_range(0, end_pfn);
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
				"%ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);

	cpa_init();
}

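/*
 * Free (and poison) the pages of an init section back to the page
 * allocator. With CONFIG_DEBUG_PAGEALLOC the pages are unmapped
 * instead, so that any late access faults.
 */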
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr = begin;

	if (addr >= end)
		return;

	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
		begin, PAGE_ALIGN(end));
	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);

	for (; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
#endif
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	set_memory_nx(start, (end - start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}

#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

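/*
 * Reserve a physical range with the bootmem allocator, using the
 * node-aware variant when the range lies within a single node, and
 * account for memory reserved below the DMA limit.
 */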
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= end_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
				phys, len);
		return;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	if (nid == next_nid)
		reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
	else
		reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#else
	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif

	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}
}

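/*
 * Check that a kernel virtual address is canonical and backed by a
 * present mapping in init_mm, down to the 2MB- or 4k-page level.
 */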
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

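/* Names for the special mappings shown in /proc/<pid>/maps. */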
const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

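/*
 * Populate the virtual memmap for a range of struct pages with 2MB
 * blocks allocated on @node, logging the ranges that were mapped.
 */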
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			pte_t entry;
			void *p;

			p = vmemmap_alloc_block(PMD_SIZE, node);
			if (!p)
				return -ENOMEM;

			entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
							PAGE_KERNEL_LARGE);
			set_pmd(pmd, __pmd(pte_val(entry)));

			/* check to see if we have contiguous blocks */
			if (p_end != p || node_start != node) {
				if (p_start)
					printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						addr_start, addr_end-1, p_start, p_end-1, node_start);
				addr_start = addr;
				node_start = node;
				p_start = p;
			}
			addr_end = addr + PMD_SIZE;
			p_end = p + PMD_SIZE;
		} else {
			vmemmap_verify((pte_t *)pmd, node, addr, next);
		}
	}
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif