init_64.c 23.1 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
T
Thomas Gleixner 已提交
21
#include <linux/initrd.h>
L
Linus Torvalds 已提交
22 23 24
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
25
#include <linux/pci.h>
26
#include <linux/pfn.h>
27
#include <linux/poison.h>
28
#include <linux/dma-mapping.h>
29 30
#include <linux/module.h>
#include <linux/memory_hotplug.h>
31
#include <linux/nmi.h>
L
Linus Torvalds 已提交
32 33 34 35 36 37 38 39 40 41 42 43 44 45

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
46
#include <asm/sections.h>
47
#include <asm/kdebug.h>
48
#include <asm/numa.h>
49
#include <asm/cacheflush.h>
L
Linus Torvalds 已提交
50

51 52 53 54 55 56 57 58 59 60 61 62
/*
 * PFN of last memory page.
 */
unsigned long end_pfn;

/*
 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
 * The direct mapping extends to max_pfn_mapped, so that we can directly access
 * apertures, ACPI and other tables without having to play with fixmaps.
 */
unsigned long max_pfn_mapped;

63 64
static unsigned long dma_reserve __initdata;

L
Linus Torvalds 已提交
65 66
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

I
Ingo Molnar 已提交
67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
/*
 * Non-zero when the kernel direct mapping should use 1GB pages.
 * Starts out as 1 only when CONFIG_DIRECT_GBPAGES is set; otherwise it is
 * left zero-initialized.
 */
#ifdef CONFIG_DIRECT_GBPAGES
int direct_gbpages __meminitdata = 1;
#else
int direct_gbpages __meminitdata;
#endif

/*
 * Early-parameter handler for "nogbpages": turns off the use of 1GB pages
 * for the direct mapping. @arg is ignored. Always returns 0.
 */
static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;

	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

/*
 * Early-parameter handler for "gbpages": requests 1GB pages for the direct
 * mapping. @arg is ignored. Always returns 0.
 */
static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;

	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

L
Linus Torvalds 已提交
87 88 89 90 91 92 93 94
/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
 * physical space, so we can cache the location of the first one and move
 * around without checking the pgd every time.
 */

void show_mem(void)
{
95 96
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
L
Linus Torvalds 已提交
97
	struct page *page;
T
Thomas Gleixner 已提交
98
	pg_data_t *pgdat;
L
Linus Torvalds 已提交
99

100
	printk(KERN_INFO "Mem-info:\n");
L
Linus Torvalds 已提交
101
	show_free_areas();
102
	for_each_online_pgdat(pgdat) {
T
Thomas Gleixner 已提交
103 104 105 106 107 108
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			/*
			 * This loop can take a while with 256 GB and
			 * 4k pages so defer the NMI watchdog:
			 */
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
109
				touch_nmi_watchdog();
T
Thomas Gleixner 已提交
110

B
Bob Picco 已提交
111 112
			if (!pfn_valid(pgdat->node_start_pfn + i))
				continue;
T
Thomas Gleixner 已提交
113

L
Linus Torvalds 已提交
114 115
			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
116 117 118 119 120 121
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
T
Thomas Gleixner 已提交
122
		}
L
Linus Torvalds 已提交
123
	}
T
Thomas Gleixner 已提交
124 125 126 127
	printk(KERN_INFO "%lu pages of RAM\n",		total);
	printk(KERN_INFO "%lu reserved pages\n",	reserved);
	printk(KERN_INFO "%lu pages shared\n",		shared);
	printk(KERN_INFO "%lu pages swap cached\n",	cached);
L
Linus Torvalds 已提交
128 129 130 131
}

int after_bootmem;

132
static __init void *spp_getpage(void)
T
Thomas Gleixner 已提交
133
{
L
Linus Torvalds 已提交
134
	void *ptr;
T
Thomas Gleixner 已提交
135

L
Linus Torvalds 已提交
136
	if (after_bootmem)
T
Thomas Gleixner 已提交
137
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
L
Linus Torvalds 已提交
138 139
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);
T
Thomas Gleixner 已提交
140 141 142 143 144

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}
L
Linus Torvalds 已提交
145

146
	pr_debug("spp_getpage %p\n", ptr);
T
Thomas Gleixner 已提交
147

L
Linus Torvalds 已提交
148
	return ptr;
T
Thomas Gleixner 已提交
149
}
L
Linus Torvalds 已提交
150

151 152
/*
 * Install @new_pte as the kernel page-table entry for @vaddr.
 *
 * The PGD entry must already exist; if it does not, an error is logged and
 * nothing is changed. Missing PMD and PTE tables are allocated with
 * spp_getpage() and hooked into init_mm. After the entry is written, the
 * TLB entry for @vaddr is flushed.
 */
void
set_pte_vaddr(unsigned long vaddr, pte_t new_pte)
{
	pgd_t *pgd_entry;
	pud_t *pud_entry;
	pmd_t *pmd_entry;
	pte_t *pte_entry;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(new_pte));

	pgd_entry = pgd_offset_k(vaddr);
	if (pgd_none(*pgd_entry)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}

	pud_entry = pud_offset(pgd_entry, vaddr);
	if (pud_none(*pud_entry)) {
		pmd_t *new_pmd_table = (pmd_t *) spp_getpage();

		pud_populate(&init_mm, pud_entry, new_pmd_table);
		/* Sanity check: the PUD must now resolve to the new table. */
		if (new_pmd_table != pmd_offset(pud_entry, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
				new_pmd_table, pmd_offset(pud_entry, 0));
			return;
		}
	}

	pmd_entry = pmd_offset(pud_entry, vaddr);
	if (pmd_none(*pmd_entry)) {
		pte_t *new_pte_table = (pte_t *) spp_getpage();

		pmd_populate_kernel(&init_mm, pmd_entry, new_pte_table);
		/* Sanity check: the PMD must now resolve to the new table. */
		if (new_pte_table != pte_offset_kernel(pmd_entry, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
			return;
		}
	}

	pte_entry = pte_offset_kernel(pmd_entry, vaddr);

	/* Complain when a live entry is replaced by a different non-zero one. */
	if (!pte_none(*pte_entry) && pte_val(new_pte) &&
	    pte_val(*pte_entry) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte_entry);
	set_pte(pte_entry, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

200
/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	const unsigned long text_start = (unsigned long) _text;
	/* Last byte of the kernel image, with _end rounded up to a PMD. */
	const unsigned long image_last =
		round_up((unsigned long)_end, PMD_SIZE) - 1;
	unsigned long vaddr = __START_KERNEL_map;
	pmd_t *pmd;

	for (pmd = level2_kernel_pgt;
	     pmd < level2_kernel_pgt + PTRS_PER_PMD;
	     pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		/* Entries that map part of [_text, _end] are kept. */
		if (vaddr >= text_start && vaddr <= image_last)
			continue;
		set_pmd(pmd, __pmd(0));
	}
}

228 229
static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;
L
Linus Torvalds 已提交
230

231
static __meminit void *alloc_low_page(unsigned long *phys)
T
Thomas Gleixner 已提交
232
{
233
	unsigned long pfn = table_end++;
L
Linus Torvalds 已提交
234 235
	void *adr;

236 237 238
	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);
T
Thomas Gleixner 已提交
239

240 241 242
		return adr;
	}

T
Thomas Gleixner 已提交
243 244
	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");
245 246

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
247
	memset(adr, 0, PAGE_SIZE);
248 249 250
	*phys  = pfn * PAGE_SIZE;
	return adr;
}
L
Linus Torvalds 已提交
251

252
static __meminit void unmap_low_page(void *adr)
T
Thomas Gleixner 已提交
253
{
254 255 256
	if (after_bootmem)
		return;

257
	early_iounmap(adr, PAGE_SIZE);
T
Thomas Gleixner 已提交
258
}
L
Linus Torvalds 已提交
259

260
static unsigned long __meminit
261
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
262
{
263 264
	unsigned long pages = 0;

265
	int i = pmd_index(address);
266

267 268
	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		pmd_t *pmd = pmd_page + pmd_index(address);
269

270
		if (address >= end) {
T
Thomas Gleixner 已提交
271
			if (!after_bootmem) {
272 273
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
T
Thomas Gleixner 已提交
274
			}
275 276
			break;
		}
277 278 279 280

		if (pmd_val(*pmd))
			continue;

281
		pages++;
282 283
		set_pte((pte_t *)pmd,
			pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
284
	}
285
	update_page_count(PG_LEVEL_2M, pages);
286
	return address;
287 288
}

289
static unsigned long __meminit
290 291
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
T
Thomas Gleixner 已提交
292
	pmd_t *pmd = pmd_offset(pud, 0);
293 294
	unsigned long last_map_addr;

295
	spin_lock(&init_mm.page_table_lock);
296
	last_map_addr = phys_pmd_init(pmd, address, end);
297 298
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
299
	return last_map_addr;
300 301
}

302
static unsigned long __meminit
T
Thomas Gleixner 已提交
303 304
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
305
	unsigned long pages = 0;
306
	unsigned long last_map_addr = end;
307
	int i = pud_index(addr);
308

T
Thomas Gleixner 已提交
309
	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
310 311
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
L
Linus Torvalds 已提交
312 313
		pmd_t *pmd;

314
		if (addr >= end)
L
Linus Torvalds 已提交
315 316
			break;

T
Thomas Gleixner 已提交
317 318 319
		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
L
Linus Torvalds 已提交
320
			continue;
T
Thomas Gleixner 已提交
321
		}
L
Linus Torvalds 已提交
322

323
		if (pud_val(*pud)) {
324
			if (!pud_large(*pud))
325
				last_map_addr = phys_pmd_update(pud, addr, end);
326 327 328 329
			continue;
		}

		if (direct_gbpages) {
330
			pages++;
331 332
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
333
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
334 335 336
			continue;
		}

337
		pmd = alloc_low_page(&pmd_phys);
T
Thomas Gleixner 已提交
338

339
		spin_lock(&init_mm.page_table_lock);
340
		pud_populate(&init_mm, pud, __va(pmd_phys));
341
		last_map_addr = phys_pmd_init(pmd, addr, end);
342
		spin_unlock(&init_mm.page_table_lock);
T
Thomas Gleixner 已提交
343

344
		unmap_low_page(pmd);
L
Linus Torvalds 已提交
345
	}
A
Andi Kleen 已提交
346
	__flush_tlb_all();
347
	update_page_count(PG_LEVEL_1G, pages);
348

349
	return last_map_addr;
T
Thomas Gleixner 已提交
350
}
L
Linus Torvalds 已提交
351 352 353

/*
 * Find room for the page tables needed to direct-map physical memory up to
 * @end, and point table_start/table_end (both pfns) at the start of it.
 *
 * The size covers the PUD entries and, unless 1GB pages are in use, the PMD
 * entries, each rounded up to whole pages. Panics when find_e820_area()
 * cannot supply such an area.
 */
static void __init find_early_table_space(unsigned long end)
{
	unsigned long nr_puds, table_bytes, search_start;

	nr_puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	table_bytes = round_up(nr_puds * sizeof(pud_t), PAGE_SIZE);

	if (!direct_gbpages) {
		unsigned long nr_pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;

		table_bytes += round_up(nr_pmds * sizeof(pmd_t), PAGE_SIZE);
	}

	/*
	 * RED-PEN putting page tables only on node 0 could
	 * cause a hotspot and fill up ZONE_DMA. The page tables
	 * need roughly 0.5KB per GB.
	 */
	search_start = 0x8000;
	table_start = find_e820_area(search_start, end, table_bytes,
				     PAGE_SIZE);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	/* From here on both cursors are page frame numbers. */
	table_start >>= PAGE_SHIFT;
	table_end = table_start;

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT,
		(table_start << PAGE_SHIFT) + table_bytes);
}

381 382 383 384 385 386 387 388
/*
 * Settle whether 1GB pages are used for the direct mapping: they stay
 * enabled only when both requested (direct_gbpages) and available
 * (cpu_has_gbpages); otherwise direct_gbpages is forced to 0.
 */
static void __init init_gbpages(void)
{
	if (!direct_gbpages || !cpu_has_gbpages) {
		direct_gbpages = 0;
		return;
	}

	printk(KERN_INFO "Using GB pages for direct mapping\n");
}

389
#ifdef CONFIG_MEMTEST
Y
Yinghai Lu 已提交
390 391 392

static void __init memtest(unsigned long start_phys, unsigned long size,
				 unsigned pattern)
Y
Yinghai Lu 已提交
393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434
{
	unsigned long i;
	unsigned long *start;
	unsigned long start_bad;
	unsigned long last_bad;
	unsigned long val;
	unsigned long start_phys_aligned;
	unsigned long count;
	unsigned long incr;

	switch (pattern) {
	case 0:
		val = 0UL;
		break;
	case 1:
		val = -1UL;
		break;
	case 2:
		val = 0x5555555555555555UL;
		break;
	case 3:
		val = 0xaaaaaaaaaaaaaaaaUL;
		break;
	default:
		return;
	}

	incr = sizeof(unsigned long);
	start_phys_aligned = ALIGN(start_phys, incr);
	count = (size - (start_phys_aligned - start_phys))/incr;
	start = __va(start_phys_aligned);
	start_bad = 0;
	last_bad = 0;

	for (i = 0; i < count; i++)
		start[i] = val;
	for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
		if (*start != val) {
			if (start_phys_aligned == last_bad + incr) {
				last_bad += incr;
			} else {
				if (start_bad) {
Y
Yinghai Lu 已提交
435
					printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
Y
Yinghai Lu 已提交
436 437 438 439 440 441 442 443
						val, start_bad, last_bad + incr);
					reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
				}
				start_bad = last_bad = start_phys_aligned;
			}
		}
	}
	if (start_bad) {
Y
Yinghai Lu 已提交
444
		printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
Y
Yinghai Lu 已提交
445 446 447 448 449 450
			val, start_bad, last_bad + incr);
		reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
	}

}

451 452
/* default is disabled */
static int memtest_pattern __initdata;
Y
Yinghai Lu 已提交
453

Y
Yinghai Lu 已提交
454 455 456
/*
 * Early-parameter handler for "memtest": stores the numeric value of @arg
 * (parsed with base auto-detection) in memtest_pattern. A missing argument
 * leaves the current value alone. Always returns 0.
 */
static int __init parse_memtest(char *arg)
{
	if (!arg)
		return 0;

	memtest_pattern = simple_strtoul(arg, NULL, 0);

	return 0;
}

early_param("memtest", parse_memtest);

/*
 * Run memtest() over the areas that find_e820_area_size() reports inside
 * [@start, @end), once for every pattern number below memtest_pattern.
 * Does nothing when memtest_pattern is 0.
 */
static void __init early_memtest(unsigned long start, unsigned long end)
{
	u64 region_start, region_len;
	unsigned pattern;

	if (!memtest_pattern)
		return;

	printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);

	for (pattern = 0; pattern < memtest_pattern; pattern++) {
		region_start = start;
		region_len = 0;

		while (region_start < end) {
			unsigned long long region_end;

			region_start = find_e820_area_size(region_start,
							   &region_len, 1);

			/* done ? */
			if (region_start >= end)
				break;

			/* Clip the area to the requested range. */
			if (region_start + region_len > end)
				region_len = end - region_start;

			region_end = (unsigned long long)region_start +
				     region_len;
			printk(KERN_CONT "\n  %016llx - %016llx pattern %d",
				(unsigned long long)region_start,
				region_end, pattern);

			memtest(region_start, region_len, pattern);

			region_start += region_len;
		}
	}

	printk(KERN_CONT "\n");
}
Y
Yinghai Lu 已提交
495 496 497 498 499
#else
/* Memory testing is compiled out without CONFIG_MEMTEST: nothing to do. */
static void __init early_memtest(unsigned long start, unsigned long end)
{
}
#endif
Y
Yinghai Lu 已提交
500

T
Thomas Gleixner 已提交
501 502 503 504 505
/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
506
unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
	const unsigned long start_phys = start, end_phys = end;
	unsigned long last_map_addr = end;
	unsigned long vaddr, vend, next;

	printk(KERN_INFO "init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 *
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem) {
		init_gbpages();
		find_early_table_space(end);
	}

	/* Walk the range by its direct-mapping virtual addresses. */
	vaddr = (unsigned long)__va(start_phys);
	vend = (unsigned long)__va(end_phys);

	while (vaddr < vend) {
		pgd_t *pgd = pgd_offset_k(vaddr);
		unsigned long pud_phys;
		pud_t *pud;

		/*
		 * After bootmem the existing PUD table is reused; before
		 * that a fresh one is taken from the early table area.
		 */
		if (after_bootmem)
			pud = pud_offset(pgd, vaddr & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		next = vaddr + PGDIR_SIZE;
		if (next > vend)
			next = vend;

		last_map_addr = phys_pud_init(pud, __pa(vaddr), __pa(next));
		if (!after_bootmem)
			pgd_populate(&init_mm, pgd, __va(pud_phys));
		unmap_low_page(pud);

		vaddr = next;
	}

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();

	if (!after_bootmem) {
		/* Keep the freshly built tables out of the allocators. */
		reserve_early(table_start << PAGE_SHIFT,
				 table_end << PAGE_SHIFT, "PGTABLE");
		early_memtest(start_phys, end_phys);
	}

	return last_map_addr >> PAGE_SHIFT;
}

562
#ifndef CONFIG_NUMA
563 564 565 566 567 568 569 570 571
/*
 * Non-NUMA bootmem setup for pfns up to @end_pfn.
 *
 * Finds room for the bootmem bitmap with find_e820_area() (panicking if
 * there is none), initialises node 0's bootmem allocator with it,
 * registers the active regions between @start_pfn and @end_pfn, frees them
 * into bootmem, transfers the early reservations, and finally reserves
 * the bitmap itself.
 */
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	const unsigned long end_phys = end_pfn << PAGE_SHIFT;
	unsigned long bootmap_size, bootmap;

	bootmap_size = bootmem_bootmap_pages(end_pfn) << PAGE_SHIFT;

	bootmap = find_e820_area(0, end_phys, bootmap_size, PAGE_SIZE);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n", bootmap_size);

	/* don't touch min_low_pfn */
	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
					 0, end_pfn);

	e820_register_active_regions(0, start_pfn, end_pfn);
	free_bootmem_with_active_regions(0, end_pfn);
	early_res_to_bootmem(0, end_phys);

	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}

L
Linus Torvalds 已提交
581 582
/*
 * Non-NUMA paging setup: mark memory up to end_pfn present on node 0,
 * initialise sparsemem, and initialise the zones with the DMA, DMA32 and
 * NORMAL upper pfn limits.
 */
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memory_present(0, 0, end_pfn);
	sparse_init();

	/* Zones without an explicit limit below keep a limit of 0. */
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = end_pfn;

	free_area_init_nodes(max_zone_pfns);
}
#endif

596 597 598
/*
 * Memory hotplug specific functions
 */
599
#ifdef CONFIG_MEMORY_HOTPLUG
600 601 602 603
/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
604
/*
 * Hot-add the physical range [@start, @start + @size) to node @nid.
 *
 * The range is first added to the kernel direct mapping (max_pfn_mapped is
 * raised if the mapping now reaches further), then its pages are added to
 * the node's ZONE_NORMAL through __add_pages().
 *
 * Returns the result of __add_pages(). A warning is emitted only when that
 * result is non-zero, rather than on every call.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size-1);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(zone, start_pfn, nr_pages);
	WARN_ON(ret);

	return ret;
}
621
EXPORT_SYMBOL_GPL(arch_add_memory);
622

623
#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
624 625 626 627
/*
 * Node lookup for hot-added memory on NUMA kernels built without ACPI NUMA:
 * @start is ignored and node 0 is always reported.
 */
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
628
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
629 630
#endif

631 632
#endif /* CONFIG_MEMORY_HOTPLUG */

633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652
/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 *
 * On x86, access has to be given to the first megabyte of ram because that area
 * contains bios code and data regions used by X and dosemu and similar apps.
 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
 * mmio resources as well as potential bios/acpi data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	/*
	 * The first megabyte is pages 0..255 (with 4k pages); page 256 is
	 * already the first page above it and must go through the RAM
	 * check like everything else.
	 */
	if (pagenr < 256)
		return 1;
	/* Anything that is not RAM (MMIO, BIOS/ACPI regions) is allowed. */
	if (!page_is_ram(pagenr))
		return 1;
	return 0;
}


T
Thomas Gleixner 已提交
653 654
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
			 kcore_modules, kcore_vsyscall;
L
Linus Torvalds 已提交
655 656 657

/*
 * Final memory bring-up: release all bootmem pages to the page allocator,
 * mark bootmem as finished (after_bootmem), register the /proc/kcore areas,
 * print the memory summary line and initialise CPA.
 */
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear_bss() has already cleared the empty_zero_page */

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	/* Whatever is neither free nor a hole counts as reserved. */
	reservedpages = end_pfn - totalram_pages -
					absent_pages_in_range(0, end_pfn);
	after_bootmem = 1;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	/* Section sizes in bytes, taken from the linker symbols. */
	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
				"%ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);

	cpa_init();
}

701
void free_init_pages(char *what, unsigned long begin, unsigned long end)
L
Linus Torvalds 已提交
702
{
703
	unsigned long addr = begin;
L
Linus Torvalds 已提交
704

705
	if (addr >= end)
706 707
		return;

I
Ingo Molnar 已提交
708 709 710 711 712 713 714 715 716 717
	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
		begin, PAGE_ALIGN(end));
	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
718
	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
T
Thomas Gleixner 已提交
719

720
	for (; addr < end; addr += PAGE_SIZE) {
721 722 723 724 725
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
L
Linus Torvalds 已提交
726 727
		totalram_pages++;
	}
I
Ingo Molnar 已提交
728
#endif
729 730 731 732 733
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
734 735
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
L
Linus Torvalds 已提交
736 737
}

738
#ifdef CONFIG_DEBUG_RODATA
739 740
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
741 742 743

void mark_rodata_ro(void)
{
744
	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
745

746
	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
747
	       (end - start) >> 10);
748 749 750 751 752 753 754 755
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	set_memory_nx(start, (end - start) >> PAGE_SHIFT);
756

757 758
	rodata_test();

759
#ifdef CONFIG_CPA_DEBUG
760
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
761
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);
762

763
	printk(KERN_INFO "Testing CPA: again\n");
764
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
765
#endif
766
}
767

768 769
#endif

L
Linus Torvalds 已提交
770 771 772
#ifdef CONFIG_BLK_DEV_INITRD
/* Release the initrd pages in [@start, @end) via free_init_pages(). */
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

777 778
/*
 * Reserve @len bytes of bootmem starting at physical address @phys.
 *
 * @flags is passed through to the bootmem reservation call on both NUMA and
 * non-NUMA builds, and a failure of that call is returned to the caller on
 * both.
 *
 * Returns 0 on success, -EFAULT when @phys lies at or beyond end_pfn, or
 * the error returned by the bootmem reservation. Reservations that end
 * below the DMA limit are also accounted through set_dma_reserve().
 */
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
				   int flags)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;
	int ret;

	if (pfn >= end_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return -EFAULT;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
				phys, len);
		return -EFAULT;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	/* Use the node-local call only when the range stays on one node. */
	if (nid == next_nid)
		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
	else
		ret = reserve_bootmem(phys, len, flags);
#else
	ret = reserve_bootmem(phys, len, flags);
#endif
	if (ret != 0)
		return ret;

	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}

	return 0;
}

T
Thomas Gleixner 已提交
823 824
int kern_addr_valid(unsigned long addr)
{
L
Linus Torvalds 已提交
825
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
T
Thomas Gleixner 已提交
826 827 828 829
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
L
Linus Torvalds 已提交
830 831

	if (above != 0 && above != -1UL)
T
Thomas Gleixner 已提交
832 833
		return 0;

L
Linus Torvalds 已提交
834 835 836 837 838 839
	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
T
Thomas Gleixner 已提交
840
		return 0;
L
Linus Torvalds 已提交
841 842 843 844

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;
T
Thomas Gleixner 已提交
845

L
Linus Torvalds 已提交
846 847 848 849 850 851
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;
T
Thomas Gleixner 已提交
852

L
Linus Torvalds 已提交
853 854 855
	return pfn_valid(pte_pfn(*pte));
}

T
Thomas Gleixner 已提交
856 857 858 859 860
/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
L
Linus Torvalds 已提交
861
static struct vm_area_struct gate_vma = {
T
Thomas Gleixner 已提交
862 863 864 865
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
L
Linus Torvalds 已提交
866 867 868 869 870
};

/*
 * Return the pseudo VMA describing the vsyscall area for @tsk, or NULL for
 * tasks flagged TIF_IA32 when CONFIG_IA32_EMULATION is enabled.
 */
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	/* Tasks flagged TIF_IA32 get no gate VMA from here. */
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif

	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);
T
Thomas Gleixner 已提交
880

881 882
	if (!vma)
		return 0;
T
Thomas Gleixner 已提交
883

L
Linus Torvalds 已提交
884 885 886
	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

T
Thomas Gleixner 已提交
887 888 889 890
/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
894
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
L
Linus Torvalds 已提交
895
}
896

897 898 899 900 901 902 903 904
const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}
905 906 907 908 909

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
910 911 912 913
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

T
Thomas Gleixner 已提交
914 915
/*
 * Back the struct page array [@start_page, @start_page + @size) with memory
 * allocated for @node, one PMD_SIZE block per PMD entry.
 *
 * Already populated PMD entries are only checked with vmemmap_verify().
 * Runs of physically contiguous blocks on the same node are tracked in
 * addr_start/addr_end/p_start/p_end/node_start, so that each run is logged
 * once, when the next run begins.
 *
 * Returns 0 on success or -ENOMEM when a table or block allocation fails.
 */
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;

	for (; addr < end; addr = next) {
		pgd_t *pgd;
		pud_t *pud;
		pmd_t *pmd;
		pte_t entry;
		void *block;

		next = pmd_addr_end(addr, end);

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		pmd = pmd_offset(pud, addr);
		if (!pmd_none(*pmd)) {
			vmemmap_verify((pte_t *)pmd, node, addr, next);
			continue;
		}

		block = vmemmap_alloc_block(PMD_SIZE, node);
		if (!block)
			return -ENOMEM;

		entry = pfn_pte(__pa(block) >> PAGE_SHIFT, PAGE_KERNEL_LARGE);
		set_pmd(pmd, __pmd(pte_val(entry)));

		/* check to see if we have contiguous blocks */
		if (p_end != block || node_start != node) {
			if (p_start)
				printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
					addr_start, addr_end-1, p_start, p_end-1, node_start);
			addr_start = addr;
			node_start = node;
			p_start = block;
		}
		addr_end = addr + PMD_SIZE;
		p_end = block + PMD_SIZE;
	}

	return 0;
}
965 966 967 968 969 970 971 972 973 974 975

/*
 * Log the run of vmemmap blocks that is still pending, if any, and reset
 * the run-tracking state.
 */
void __meminit vmemmap_populate_print_last(void)
{
	if (!p_start)
		return;

	printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
		addr_start, addr_end-1, p_start, p_end-1, node_start);

	p_start = NULL;
	p_end = NULL;
	node_start = 0;
}
976
#endif