/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>

/*
 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
 * The direct mapping extends to max_pfn_mapped, so that we can directly access
 * apertures, ACPI and other tables without having to play with fixmaps.
 */
unsigned long max_pfn_mapped;

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

int direct_gbpages __meminitdata
#ifdef CONFIG_DIRECT_GBPAGES
				= 1
#endif
;

static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init() allocates all the fixmap pagetables
 * contiguously in physical space, so we can cache the location of
 * the first one and move around without checking the pgd every time.
 */

void show_mem(void)
{
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
	struct page *page;
	pg_data_t *pgdat;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	for_each_online_pgdat(pgdat) {
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			/*
			 * This loop can take a while with 256 GB and
			 * 4k pages so defer the NMI watchdog:
			 */
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
				touch_nmi_watchdog();

			if (!pfn_valid(pgdat->node_start_pfn + i))
				continue;

			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
	}
	printk(KERN_INFO "%lu pages of RAM\n",		total);
	printk(KERN_INFO "%lu reserved pages\n",	reserved);
	printk(KERN_INFO "%lu pages shared\n",		shared);
	printk(KERN_INFO "%lu pages swap cached\n",	cached);
}

int after_bootmem;

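/*
 * Allocate a page for a kernel page table. Pages come from the
 * bootmem allocator during early boot, and from the page allocator
 * once bootmem has been retired (after_bootmem != 0).
 */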
static __init void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

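/*
 * Install @new_pte for @vaddr underneath the given pud page,
 * allocating intermediate pmd/pte pages via spp_getpage() as needed.
 */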
void
set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
				pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
			return;
		}
	}

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) && pte_val(new_pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

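/*
 * Install a pte for a kernel virtual address, walking down from the
 * kernel pgd. The pgd entry itself must already have been set up by
 * head.S.
 */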
void
set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t *)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;
static unsigned long __meminitdata table_top;

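/*
 * Allocate a zeroed page for the early direct-mapping page tables.
 * Before bootmem is available the page is carved out of the range
 * found by find_early_table_space() and temporarily mapped with
 * early_ioremap(); its physical address is returned via @phys.
 */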
static __meminit void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= table_top)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

static __meminit void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

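/*
 * Fill in the pte level of the direct mapping for [addr, end) with
 * 4k PAGE_KERNEL pages, leaving already-populated entries alone.
 */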
static void __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
{
	unsigned pages = 0;
	int i;
	pte_t *pte = pte_page + pte_index(addr);

	for (i = pte_index(addr); i < PTRS_PER_PTE;
	     i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

		if (pte_val(*pte))
			continue;

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
		pages++;
	}
	update_page_count(PG_LEVEL_4K, pages);
}

static void __meminit
phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
{
	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);

	phys_pte_init(pte, address, end);
}

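/*
 * Fill in the pmd level for [address, end): map 2MB pages directly
 * when the CPU has PSE, otherwise fill a pte page per pmd entry.
 * Returns the last address handled.
 */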
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
	unsigned long pages = 0;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long pte_phys;
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd)) {
			phys_pte_update(pmd, address, end);
			continue;
		}

		if (cpu_has_pse) {
			pages++;
			set_pte((pte_t *)pmd,
				pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			continue;
		}

		pte = alloc_low_page(&pte_phys);
		phys_pte_init(pte, address, end);
		unmap_low_page(pte);

		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
	}
	update_page_count(PG_LEVEL_2M, pages);
	return address;
}

static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long last_map_addr;

	spin_lock(&init_mm.page_table_lock);
	last_map_addr = phys_pmd_init(pmd, address, end);
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
	return last_map_addr;
}

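/*
 * Fill in the pud level for [addr, end): map 1GB pages directly when
 * gbpages are enabled, otherwise descend into phys_pmd_init().
 * Returns the highest physical address mapped.
 */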
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud))
				last_map_addr = phys_pmd_update(pud, addr, end);
			continue;
		}

		if (direct_gbpages) {
			pages++;
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);

		spin_lock(&init_mm.page_table_lock);
		last_map_addr = phys_pmd_init(pmd, addr, end);
		unmap_low_page(pmd);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		spin_unlock(&init_mm.page_table_lock);

	}
	__flush_tlb_all();
	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end)
{
	pud_t *pud;

	pud = (pud_t *)pgd_page_vaddr(*pgd);

	return phys_pud_init(pud, addr, end);
}

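/*
 * Estimate the worst-case size of the early direct-mapping page
 * tables and locate a contiguous e820 area to hold them; the area is
 * reserved later by init_memory_mapping().
 */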
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
	if (!direct_gbpages) {
		unsigned long pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
		tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
	}
	if (!cpu_has_pse) {
		unsigned long ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
		tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
	}

	/*
	 * RED-PEN putting page tables only on node 0 could
	 * cause a hotspot and fill up ZONE_DMA. The page tables
	 * need roughly 0.5KB per GB.
	 */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;
	table_top = table_start + (tables >> PAGE_SHIFT);

	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
}

static void __init init_gbpages(void)
{
	if (direct_gbpages && cpu_has_gbpages)
		printk(KERN_INFO "Using GB pages for direct mapping\n");
	else
		direct_gbpages = 0;
}

#ifdef CONFIG_MEMTEST

static void __init memtest(unsigned long start_phys, unsigned long size,
				 unsigned pattern)
{
	unsigned long i;
	unsigned long *start;
	unsigned long start_bad;
	unsigned long last_bad;
	unsigned long val;
	unsigned long start_phys_aligned;
	unsigned long count;
	unsigned long incr;

	switch (pattern) {
	case 0:
		val = 0UL;
		break;
	case 1:
		val = -1UL;
		break;
	case 2:
		val = 0x5555555555555555UL;
		break;
	case 3:
		val = 0xaaaaaaaaaaaaaaaaUL;
		break;
	default:
		return;
	}

	incr = sizeof(unsigned long);
	start_phys_aligned = ALIGN(start_phys, incr);
	count = (size - (start_phys_aligned - start_phys))/incr;
	start = __va(start_phys_aligned);
	start_bad = 0;
	last_bad = 0;

	for (i = 0; i < count; i++)
		start[i] = val;
	for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
		if (*start != val) {
			if (start_phys_aligned == last_bad + incr) {
				last_bad += incr;
			} else {
				if (start_bad) {
					printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
						val, start_bad, last_bad + incr);
					reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
				}
				start_bad = last_bad = start_phys_aligned;
			}
		}
	}
	if (start_bad) {
		printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
			val, start_bad, last_bad + incr);
		reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
	}

}

/* default is disabled */
static int memtest_pattern __initdata;

static int __init parse_memtest(char *arg)
{
	if (arg)
		memtest_pattern = simple_strtoul(arg, NULL, 0);
	return 0;
}

early_param("memtest", parse_memtest);

static void __init early_memtest(unsigned long start, unsigned long end)
{
	u64 t_start, t_size;
	unsigned pattern;

	if (!memtest_pattern)
		return;

	printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
	for (pattern = 0; pattern < memtest_pattern; pattern++) {
		t_start = start;
		t_size = 0;
		while (t_start < end) {
			t_start = find_e820_area_size(t_start, &t_size, 1);

			/* done ? */
			if (t_start >= end)
				break;
			if (t_start + t_size > end)
				t_size = end - t_start;

			printk(KERN_CONT "\n  %016llx - %016llx pattern %d",
				(unsigned long long)t_start,
				(unsigned long long)t_start + t_size, pattern);

			memtest(t_start, t_size, pattern);

			t_start += t_size;
		}
	}
	printk(KERN_CONT "\n");
}
#else
static void __init early_memtest(unsigned long start, unsigned long end)
{
}
#endif

/*
 * Set up the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next, last_map_addr = end;
	unsigned long start_phys = start, end_phys = end;

	printk(KERN_INFO "init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 *
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem) {
		init_gbpages();
		find_early_table_space(end);
	}

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			last_map_addr = phys_pud_update(pgd, __pa(start), __pa(end));
			continue;
		}

		if (after_bootmem)
			pud = pud_offset(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
		unmap_low_page(pud);
		if (!after_bootmem)
			pgd_populate(&init_mm, pgd_offset_k(start),
				     __va(pud_phys));
	}

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();

	if (!after_bootmem)
		reserve_early(table_start << PAGE_SHIFT,
				 table_end << PAGE_SHIFT, "PGTABLE");

	if (!after_bootmem)
		early_memtest(start_phys, end_phys);

	return last_map_addr >> PAGE_SHIFT;
}

#ifndef CONFIG_NUMA
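/* Set up node 0 bootmem from the e820 map. */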
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long bootmap_size, bootmap;

	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
				 PAGE_SIZE);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
	/* don't touch min_low_pfn */
	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
					 0, end_pfn);
	e820_register_active_regions(0, start_pfn, end_pfn);
	free_bootmem_with_active_regions(0, end_pfn);
	early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}

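/* Size the memory zones and initialize the sparsemem and zone data. */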
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	memory_present(0, 0, max_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is always added to the NORMAL zone. This means you will
 * never get additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size-1);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(zone, start_pfn, nr_pages);
	WARN_ON_ONCE(ret);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain
 * address is valid. The argument is a physical page number.
 *
 * On x86, access has to be given to the first megabyte of RAM because
 * that area contains BIOS code and data regions used by X, DOSEMU and
 * similar apps. Access has to be given to non-kernel-RAM areas as
 * well; these contain the PCI mmio resources as well as potential
 * BIOS/ACPI data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)
		return 1;
	if (!page_is_ram(pagenr))
		return 1;
	return 0;
}


static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
			 kcore_modules, kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear_bss() already clear the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = max_pfn - totalram_pages -
					absent_pages_in_range(0, max_pfn);
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
				"%ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);

	cpa_init();
}

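/*
 * Hand a range of init memory back to the page allocator, poisoning
 * it first. With CONFIG_DEBUG_PAGEALLOC the pages are unmapped
 * instead, so that any late reference faults.
 */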
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr = begin;

	if (addr >= end)
		return;

	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
		begin, PAGE_ALIGN(end));
	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);

	for (; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
#endif
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	set_memory_nx(start, (end - start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}

#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

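/*
 * Reserve a physical range in bootmem, handling ranges that span
 * NUMA nodes and accounting for pages taken out of ZONE_DMA.
 */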
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
				   int flags)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
	int ret;
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= max_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return -EFAULT;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
				phys, len);
		return -EFAULT;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	if (nid == next_nid)
		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
	else
		ret = reserve_bootmem(phys, len, flags);

	if (ret != 0)
		return ret;

#else
	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif

	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}

	return 0;
}

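/*
 * Walk the kernel page tables to decide whether a kernel virtual
 * address is backed by a valid pfn (2MB large pages included).
 */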
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

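/*
 * Populate the virtual memmap for a range of struct pages, using 2MB
 * PMD mappings where PSE is available and 4k ptes otherwise.
 */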
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif