/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>

/*
 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
 * The direct mapping extends to max_pfn_mapped, so that we can directly access
 * apertures, ACPI and other tables without having to play with fixmaps.
 */
unsigned long max_pfn_mapped;

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
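
/*
 * Allow 1GB pages for the kernel direct mapping where the hardware
 * supports them.  The CONFIG_DIRECT_GBPAGES default can be overridden
 * on the command line with "gbpages"/"nogbpages"; init_gbpages() later
 * clears the flag when the CPU lacks gbpages support.
 */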
int direct_gbpages __meminitdata
#ifdef CONFIG_DIRECT_GBPAGES
				= 1
#endif
;

static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

void show_mem(void)
{
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
	struct page *page;
	pg_data_t *pgdat;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	for_each_online_pgdat(pgdat) {
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			/*
			 * This loop can take a while with 256 GB and
			 * 4k pages so defer the NMI watchdog:
			 */
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
				touch_nmi_watchdog();

			if (!pfn_valid(pgdat->node_start_pfn + i))
				continue;

			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
	}
	printk(KERN_INFO "%lu pages of RAM\n",		total);
	printk(KERN_INFO "%lu reserved pages\n",	reserved);
	printk(KERN_INFO "%lu pages shared\n",		shared);
	printk(KERN_INFO "%lu pages swap cached\n",	cached);
}

int after_bootmem;

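/*
 * Allocate one zeroed page for a new page-table level: from the bootmem
 * allocator during early boot, from the page allocator afterwards.
 */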
static __init void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

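/*
 * Install a single kernel PTE at vaddr, allocating the intermediate
 * pud/pmd levels on demand.  Used for fixmap-style one-off kernel
 * mappings.
 */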
void
set_pte_vaddr(unsigned long vaddr, pte_t new_pte)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(new_pte));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
				pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
			return;
		}
	}

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) && pte_val(new_pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;
static unsigned long __meminitdata table_top;

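/*
 * Hand out the next page reserved for early page tables.  Before bootmem
 * is up, the page comes from the window reserved by
 * find_early_table_space() and is made accessible via early_ioremap().
 */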
static __meminit void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= table_top)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

static __meminit void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

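/*
 * Set up 2MB page mappings at the PMD level for the range address..end.
 * Returns the address the loop stopped at, which callers use to track
 * how far the direct mapping reaches.
 */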
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
	unsigned long pages = 0;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		pmd_t *pmd = pmd_page + pmd_index(address);

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd))
			continue;

		pages++;
		set_pte((pte_t *)pmd,
			pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
	}
	update_page_count(PG_LEVEL_2M, pages);
	return address;
}

static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long last_map_addr;

	spin_lock(&init_mm.page_table_lock);
	last_map_addr = phys_pmd_init(pmd, address, end);
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
	return last_map_addr;
}

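/*
 * Map addr..end at the PUD level: with 1GB pages directly when
 * direct_gbpages is enabled, otherwise by filling in PMD-level 2MB
 * mappings.  Returns the last address mapped.
 */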
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud))
				last_map_addr = phys_pmd_update(pud, addr, end);
			continue;
		}

		if (direct_gbpages) {
			pages++;
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		last_map_addr = phys_pmd_init(pmd, addr, end);
		spin_unlock(&init_mm.page_table_lock);

		unmap_low_page(pmd);
	}
	__flush_tlb_all();
	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}

static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
	if (!direct_gbpages) {
		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
		tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
	}
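
	/*
	 * Sizing sketch: one 4KB pmd page maps 1GB in 2MB entries and one
	 * 4KB pud page maps 512GB, so "tables" works out to roughly 4KB of
	 * pmds per GB of memory, unless gbpages removes the pmd level.
	 */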

	/*
	 * RED-PEN putting page tables only on node 0 could
	 * cause a hotspot and fill up ZONE_DMA. The page tables
	 * need roughly 0.5KB per GB.
	 */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;
	table_top = table_start + (tables >> PAGE_SHIFT);

	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
}

static void __init init_gbpages(void)
{
	if (direct_gbpages && cpu_has_gbpages)
		printk(KERN_INFO "Using GB pages for direct mapping\n");
	else
		direct_gbpages = 0;
}

#ifdef CONFIG_MEMTEST
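/*
 * CONFIG_MEMTEST: write test patterns over free e820 RAM at boot, read
 * them back, and reserve any regions that fail as "BAD RAM".
 */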

static void __init memtest(unsigned long start_phys, unsigned long size,
				 unsigned pattern)
{
	unsigned long i;
	unsigned long *start;
	unsigned long start_bad;
	unsigned long last_bad;
	unsigned long val;
	unsigned long start_phys_aligned;
	unsigned long count;
	unsigned long incr;

	switch (pattern) {
	case 0:
		val = 0UL;
		break;
	case 1:
		val = -1UL;
		break;
	case 2:
		val = 0x5555555555555555UL;
		break;
	case 3:
		val = 0xaaaaaaaaaaaaaaaaUL;
		break;
	default:
		return;
	}

	incr = sizeof(unsigned long);
	start_phys_aligned = ALIGN(start_phys, incr);
	count = (size - (start_phys_aligned - start_phys))/incr;
	start = __va(start_phys_aligned);
	start_bad = 0;
	last_bad = 0;

	for (i = 0; i < count; i++)
		start[i] = val;
	for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
		if (*start != val) {
			if (start_phys_aligned == last_bad + incr) {
				last_bad += incr;
			} else {
				if (start_bad) {
					printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
						val, start_bad, last_bad + incr);
					reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
				}
				start_bad = last_bad = start_phys_aligned;
			}
		}
	}
	if (start_bad) {
		printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
			val, start_bad, last_bad + incr);
		reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
	}

}

/* default is disabled */
static int memtest_pattern __initdata;

static int __init parse_memtest(char *arg)
{
	if (arg)
		memtest_pattern = simple_strtoul(arg, NULL, 0);
	return 0;
}

early_param("memtest", parse_memtest);
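/* e.g. booting with "memtest=4" runs all four patterns above. */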

static void __init early_memtest(unsigned long start, unsigned long end)
{
	u64 t_start, t_size;
	unsigned pattern;

	if (!memtest_pattern)
		return;

	printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
	for (pattern = 0; pattern < memtest_pattern; pattern++) {
		t_start = start;
		t_size = 0;
		while (t_start < end) {
			t_start = find_e820_area_size(t_start, &t_size, 1);

			/* done ? */
			if (t_start >= end)
				break;
			if (t_start + t_size > end)
				t_size = end - t_start;

			printk(KERN_CONT "\n  %016llx - %016llx pattern %d",
				(unsigned long long)t_start,
				(unsigned long long)t_start + t_size, pattern);

			memtest(t_start, t_size, pattern);

			t_start += t_size;
		}
	}
	printk(KERN_CONT "\n");
}
#else
static void __init early_memtest(unsigned long start, unsigned long end)
{
}
#endif

/*
 * Set up the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next, last_map_addr = end;
	unsigned long start_phys = start, end_phys = end;

	printk(KERN_INFO "init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 *
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem) {
		init_gbpages();
		find_early_table_space(end);
	}

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		if (after_bootmem)
			pud = pud_offset(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
		if (!after_bootmem)
			pgd_populate(&init_mm, pgd_offset_k(start),
				     __va(pud_phys));
		unmap_low_page(pud);
	}

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();

	if (!after_bootmem)
		reserve_early(table_start << PAGE_SHIFT,
				 table_end << PAGE_SHIFT, "PGTABLE");

	if (!after_bootmem)
		early_memtest(start_phys, end_phys);

	return last_map_addr >> PAGE_SHIFT;
}

#ifndef CONFIG_NUMA
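/*
 * Flat (non-NUMA) setup: place the bootmem bitmap via the e820 map and
 * hand all of memory to the bootmem allocator on node 0.
 */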
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long bootmap_size, bootmap;

	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
				 PAGE_SIZE);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
	/* don't touch min_low_pfn */
	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
					 0, end_pfn);
	e820_register_active_regions(0, start_pfn, end_pfn);
	free_bootmem_with_active_regions(0, end_pfn);
	early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}

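/*
 * Zone layout on x86_64: ZONE_DMA covers the ISA-reachable low 16MB,
 * ZONE_DMA32 everything below 4GB, ZONE_NORMAL all memory above that.
 */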
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	memory_present(0, 0, max_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is always added to the NORMAL zone. This means you will never
 * get additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size-1);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(zone, start_pfn, nr_pages);
	WARN_ON(ret);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 * On x86, access has to be given to the first megabyte of RAM because that
 * area contains BIOS code and data regions used by X and dosemu and similar
 * apps. Access has to be given to non-kernel-RAM areas as well; these
 * contain the PCI mmio resources as well as potential bios/acpi data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)
		return 1;
	if (!page_is_ram(pagenr))
		return 1;
	return 0;
}


static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
			 kcore_modules, kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear_bss() already cleared the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = max_pfn - totalram_pages -
					absent_pages_in_range(0, max_pfn);
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
				"%ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);

	cpa_init();
}

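/*
 * Return the pages between begin and end to the page allocator,
 * poisoning them first so that stale references to freed init memory
 * are easier to spot.
 */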
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr = begin;

	if (addr >= end)
		return;

	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
		begin, PAGE_ALIGN(end));
	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);

	for (; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
#endif
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	set_memory_nx(start, (end - start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}

#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

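/*
 * Reserve a bootmem range, routing the reservation to the right node
 * under NUMA and accounting ZONE_DMA pages via dma_reserve.
 */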
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
				   int flags)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
	int ret;
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= max_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return -EFAULT;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
				phys, len);
		return -EFAULT;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	if (nid == next_nid)
		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
	else
		ret = reserve_bootmem(phys, len, flags);

	if (ret != 0)
		return ret;

#else
	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif

	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}

	return 0;
}

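/*
 * Walk the kernel page tables and check whether the given virtual
 * address is backed by a present mapping, at any page size.
 */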
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
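
/*
 * Rough arithmetic: each 2MB PMD block of vmemmap holds
 * 2MB / sizeof(struct page) entries; assuming the common 64-byte
 * struct page (the exact size is config-dependent), that is 32768
 * page structs, i.e. about 128MB of memory described per huge page.
 */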
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			pte_t entry;
			void *p;

			p = vmemmap_alloc_block(PMD_SIZE, node);
			if (!p)
				return -ENOMEM;

			entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
							PAGE_KERNEL_LARGE);
			set_pmd(pmd, __pmd(pte_val(entry)));

			/* check to see if we have contiguous blocks */
			if (p_end != p || node_start != node) {
				if (p_start)
					printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						addr_start, addr_end-1, p_start, p_end-1, node_start);
				addr_start = addr;
				node_start = node;
				p_start = p;
			}
			addr_end = addr + PMD_SIZE;
			p_end = p + PMD_SIZE;
		} else {
			vmemmap_verify((pte_t *)pmd, node, addr, next);
		}
	}
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif