/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>

/*
 * PFN of last memory page.
 */
unsigned long end_pfn;

/*
 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
 * The direct mapping extends to max_pfn_mapped, so that we can directly access
 * apertures, ACPI and other tables without having to play with fixmaps.
 */
unsigned long max_pfn_mapped;

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

int direct_gbpages __meminitdata
#ifdef CONFIG_DIRECT_GBPAGES
				= 1
#endif
;

static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

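/*
 * Example (hypothetical command lines): booting with "gbpages" forces
 * 1GB pages for the direct mapping on, "nogbpages" forces them off;
 * the default comes from CONFIG_DIRECT_GBPAGES above.
 */
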
/*
 * NOTE: pagetable_init() allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the location of the first one and
 * move around without checking the pgd every time.
 */

void show_mem(void)
{
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
	struct page *page;
	pg_data_t *pgdat;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	for_each_online_pgdat(pgdat) {
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			/*
			 * This loop can take a while with 256 GB and
			 * 4k pages so defer the NMI watchdog:
			 */
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
				touch_nmi_watchdog();

			if (!pfn_valid(pgdat->node_start_pfn + i))
				continue;

			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
	}
	printk(KERN_INFO "%lu pages of RAM\n",		total);
	printk(KERN_INFO "%lu reserved pages\n",	reserved);
	printk(KERN_INFO "%lu pages shared\n",		shared);
	printk(KERN_INFO "%lu pages swap cached\n",	cached);
}

int after_bootmem;

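/*
 * Allocate one zeroed page for early page-table construction: from the
 * bootmem allocator while it is still up, from the page allocator once
 * mem_init() has set after_bootmem.
 */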
static __init void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("spp_getpage: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

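/*
 * Install a single pte in the kernel address space at @vaddr, allocating
 * missing pmd/pte pages via spp_getpage(). Fixmap-style mappings boil
 * down to a call like the (hypothetical) example below:
 *
 *	set_pte_vaddr(vaddr, pfn_pte(phys >> PAGE_SHIFT, PAGE_KERNEL));
 */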
void
set_pte_vaddr(unsigned long vaddr, pte_t new_pte)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(new_pte));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be set up in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		if (pmd != pmd_offset(pud, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
				pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
			return;
		}
	}

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) && pte_val(new_pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;

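/*
 * Grab a zeroed page for an early page table. Before bootmem is up the
 * page comes from the range reserved by find_early_table_space() and is
 * temporarily mapped with early_ioremap(); afterwards the page allocator
 * is used and *phys is filled in from __pa().
 */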
static __meminit void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

static __meminit void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

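/*
 * Temporarily map [addr, addr + size) with 2MB pmds in the kernel-text
 * mapping so that early boot code can touch physical memory before the
 * direct mapping is complete.
 */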
/* Must run before zap_low_mappings */
__meminit void *early_ioremap(unsigned long addr, unsigned long size)
{
	pmd_t *pmd, *last_pmd;
	unsigned long vaddr;
	int i, pmds;

	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	vaddr = __START_KERNEL_map;
	pmd = level2_kernel_pgt;
	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;

	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
		for (i = 0; i < pmds; i++) {
			if (pmd_present(pmd[i]))
				goto continue_outer_loop;
		}
		vaddr += addr & ~PMD_MASK;
		addr &= PMD_MASK;

		for (i = 0; i < pmds; i++, addr += PMD_SIZE)
			set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
		__flush_tlb_all();

		return (void *)vaddr;
continue_outer_loop:
		;
	}
	printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size);

	return NULL;
}

/*
 * To avoid virtual aliases later:
 */
__meminit void early_iounmap(void *addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd;
	int i, pmds;

	vaddr = (unsigned long)addr;
	pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	pmd = level2_kernel_pgt + pmd_index(vaddr);

	for (i = 0; i < pmds; i++)
		pmd_clear(pmd + i);

	__flush_tlb_all();
}

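/*
 * Create 2MB-page mappings in one pmd page for [address, end); entries
 * past the end are cleared during the bootmem phase. Returns the address
 * at which the walk stopped.
 */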
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
	unsigned long pages = 0;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		pmd_t *pmd = pmd_page + pmd_index(address);

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd))
			continue;

		pages++;
		set_pte((pte_t *)pmd,
			pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
	}
	update_page_count(PG_LEVEL_2M, pages);
	return address;
}

static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long last_map_addr;

	spin_lock(&init_mm.page_table_lock);
	last_map_addr = phys_pmd_init(pmd, address, end);
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
	return last_map_addr;
}

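/*
 * Build the mappings for [addr, end) in one pud page: with 1GB pages
 * directly when direct_gbpages is enabled, otherwise by filling fresh
 * pmd pages. Returns the last mapped address as a pfn.
 */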
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud))
				last_map_addr = phys_pmd_update(pud, addr, end);
			continue;
		}

		if (direct_gbpages) {
			pages++;
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);

		spin_lock(&init_mm.page_table_lock);
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
		last_map_addr = phys_pmd_init(pmd, addr, end);
		spin_unlock(&init_mm.page_table_lock);

		unmap_low_page(pmd);
	}
	__flush_tlb_all();
	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr >> PAGE_SHIFT;
}

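/*
 * Estimate how much memory the direct-mapping page tables for [0, end)
 * can need (gbpages skip the pmd level) and reserve an e820 area for
 * them, starting at 0x8000.
 */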
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
	if (!direct_gbpages) {
		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
		tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
	}

	/*
	 * RED-PEN putting page tables only on node 0 could
	 * cause a hotspot and fill up ZONE_DMA. The page tables
	 * need roughly 0.5KB per GB.
	 */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT,
		(table_start << PAGE_SHIFT) + tables);
}

static void __init init_gbpages(void)
{
	if (direct_gbpages && cpu_has_gbpages)
		printk(KERN_INFO "Using GB pages for direct mapping\n");
	else
		direct_gbpages = 0;
}

#ifdef CONFIG_MEMTEST

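/*
 * Fill [start_phys, start_phys + size) with @pattern and read it back;
 * any mismatching ranges are reserved early as "BAD RAM".
 */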
static void __init memtest(unsigned long start_phys, unsigned long size,
				 unsigned pattern)
{
	unsigned long i;
	unsigned long *start;
	unsigned long start_bad;
	unsigned long last_bad;
	unsigned long val;
	unsigned long start_phys_aligned;
	unsigned long count;
	unsigned long incr;

	switch (pattern) {
	case 0:
		val = 0UL;
		break;
	case 1:
		val = -1UL;
		break;
	case 2:
		val = 0x5555555555555555UL;
		break;
	case 3:
		val = 0xaaaaaaaaaaaaaaaaUL;
		break;
	default:
		return;
	}

	incr = sizeof(unsigned long);
	start_phys_aligned = ALIGN(start_phys, incr);
	count = (size - (start_phys_aligned - start_phys))/incr;
	start = __va(start_phys_aligned);
	start_bad = 0;
	last_bad = 0;

	for (i = 0; i < count; i++)
		start[i] = val;
	for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
		if (*start != val) {
			if (start_phys_aligned == last_bad + incr) {
				last_bad += incr;
			} else {
				if (start_bad) {
					printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
						val, start_bad, last_bad + incr);
					reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
				}
				start_bad = last_bad = start_phys_aligned;
			}
		}
	}
	if (start_bad) {
		printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
			val, start_bad, last_bad + incr);
		reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
	}

}

/* default is disabled */
static int memtest_pattern __initdata;

static int __init parse_memtest(char *arg)
{
	if (arg)
		memtest_pattern = simple_strtoul(arg, NULL, 0);
	return 0;
}

early_param("memtest", parse_memtest);

static void __init early_memtest(unsigned long start, unsigned long end)
{
	u64 t_start, t_size;
	unsigned pattern;

	if (!memtest_pattern)
		return;

	printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
	for (pattern = 0; pattern < memtest_pattern; pattern++) {
		t_start = start;
		t_size = 0;
		while (t_start < end) {
			t_start = find_e820_area_size(t_start, &t_size, 1);

			/* done ? */
			if (t_start >= end)
				break;
			if (t_start + t_size > end)
				t_size = end - t_start;

			printk(KERN_CONT "\n  %016llx - %016llx pattern %d",
				(unsigned long long)t_start,
				(unsigned long long)t_start + t_size, pattern);

			memtest(t_start, t_size, pattern);

			t_start += t_size;
		}
	}
	printk(KERN_CONT "\n");
}
#else
static void __init early_memtest(unsigned long start, unsigned long end)
{
}
#endif

/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next, last_map_addr = end;
	unsigned long start_phys = start, end_phys = end;

	printk(KERN_INFO "init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 *
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem) {
		init_gbpages();
		find_early_table_space(end);
	}

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		if (after_bootmem)
			pud = pud_offset(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
		if (!after_bootmem)
			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
		unmap_low_page(pud);
	}

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();

	if (!after_bootmem)
		reserve_early(table_start << PAGE_SHIFT,
				 table_end << PAGE_SHIFT, "PGTABLE");

	if (!after_bootmem)
		early_memtest(start_phys, end_phys);

	return last_map_addr;
}

#ifndef CONFIG_NUMA
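/*
 * Non-NUMA bootmem setup: place the bootmem bitmap via the e820 map,
 * register node 0's active regions and hand the early reservations
 * over to bootmem.
 */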
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long bootmap_size, bootmap;

	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
				 PAGE_SIZE);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
	e820_register_active_regions(0, start_pfn, end_pfn);
	free_bootmem_with_active_regions(0, end_pfn);
	early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}

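/* Set up the zone limits (DMA/DMA32/NORMAL) and the sparse memmap. */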
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = end_pfn;

	memory_present(0, 0, end_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size-1);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(zone, start_pfn, nr_pages);
	WARN_ON(ret);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 * On x86, access has to be given to the first megabyte of ram because that area
 * contains bios code and data regions used by X and dosemu and similar apps.
 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
 * mmio resources as well as potential bios/acpi data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)
		return 1;
	if (!page_is_ram(pagenr))
		return 1;
	return 0;
}

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
			 kcore_modules, kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear_bss() already cleared the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = end_pfn - totalram_pages -
					absent_pages_in_range(0, end_pfn);
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
				"%ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);

	cpa_init();
}

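/*
 * Free the pages backing an init section, poisoning them with
 * POISON_FREE_INITMEM first; with CONFIG_DEBUG_PAGEALLOC they are
 * instead marked not-present so that stray accesses fault.
 */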
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr = begin;

	if (addr >= end)
		return;

	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
		begin, PAGE_ALIGN(end));
	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);

	for (; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
#endif
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

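/*
 * Write-protect the kernel text and rodata and additionally make the
 * rodata section (but not the text) non-executable.
 */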
void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	set_memory_nx(start, (end - start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}

#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

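/*
 * Early reservation helper usable with and without NUMA; ranges below
 * the DMA limit are additionally accounted via set_dma_reserve().
 */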
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
				   int flags)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;
	int ret;

	if (pfn >= end_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return -EFAULT;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
				phys, len);
		return -EFAULT;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	if (nid == next_nid)
		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
	else
		ret = reserve_bootmem(phys, len, flags);

	if (ret != 0)
		return ret;

#else
	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif

	if (phys + len <= MAX_DMA_PFN * PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}

	return 0;
}

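/*
 * Check whether a kernel virtual address is backed by a valid pfn by
 * walking the page tables; 2MB mappings are handled at the pmd level.
 */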
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			pte_t entry;
			void *p;

			p = vmemmap_alloc_block(PMD_SIZE, node);
			if (!p)
				return -ENOMEM;

			entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
							PAGE_KERNEL_LARGE);
			set_pmd(pmd, __pmd(pte_val(entry)));

			/* check to see if we have contiguous blocks */
			if (p_end != p || node_start != node) {
				if (p_start)
					printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						addr_start, addr_end-1, p_start, p_end-1, node_start);
				addr_start = addr;
				node_start = node;
				p_start = p;
			}
			addr_end = addr + PMD_SIZE;
			p_end = p + PMD_SIZE;
		} else {
			vmemmap_verify((pte_t *)pmd, node, addr, next);
		}
	}
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif