/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

int direct_gbpages __meminitdata
#ifdef CONFIG_DIRECT_GBPAGES
				= 1
#endif
;

static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init() allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the location of the first one and
 * move around without checking the pgd every time.
 */

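/* Show a summary of page usage for all online nodes. */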
void show_mem(void)
{
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
	struct page *page;
	pg_data_t *pgdat;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	for_each_online_pgdat(pgdat) {
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			/*
			 * This loop can take a while with 256 GB and
			 * 4k pages so defer the NMI watchdog:
			 */
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
				touch_nmi_watchdog();

			if (!pfn_valid(pgdat->node_start_pfn + i))
				continue;

			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
	}
	printk(KERN_INFO "%lu pages of RAM\n",		total);
	printk(KERN_INFO "%lu reserved pages\n",	reserved);
	printk(KERN_INFO "%lu pages shared\n",		shared);
	printk(KERN_INFO "%lu pages swap cached\n",	cached);
}

int after_bootmem;

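/*
 * Get a zeroed page for a new page table: from the page allocator once
 * bootmem is gone, from the bootmem allocator before that.
 */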
static __init void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

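/*
 * Install a kernel mapping of @phys at @vaddr with protection @prot,
 * allocating any missing intermediate page tables via spp_getpage().
 */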
static void
set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		if (pmd != pmd_offset(pud, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
				pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) && pte_val(new_pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;

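/*
 * Get a page for early page-table construction: once bootmem is up it
 * comes from the page allocator, before that the next page of the
 * table_start..table_end window is temporarily mapped via early_ioremap().
 */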
static __meminit void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

static __meminit void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

/* Must run before zap_low_mappings */
__meminit void *early_ioremap(unsigned long addr, unsigned long size)
{
	pmd_t *pmd, *last_pmd;
	unsigned long vaddr;
	int i, pmds;

	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	vaddr = __START_KERNEL_map;
	pmd = level2_kernel_pgt;
	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;

	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
		for (i = 0; i < pmds; i++) {
			if (pmd_present(pmd[i]))
				goto continue_outer_loop;
		}
		vaddr += addr & ~PMD_MASK;
		addr &= PMD_MASK;

		for (i = 0; i < pmds; i++, addr += PMD_SIZE)
			set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
		__flush_tlb_all();

		return (void *)vaddr;
continue_outer_loop:
		;
	}
	printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size);

	return NULL;
}

/*
 * To avoid virtual aliases later:
 */
__meminit void early_iounmap(void *addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd;
	int i, pmds;

	vaddr = (unsigned long)addr;
	pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	pmd = level2_kernel_pgt + pmd_index(vaddr);

	for (i = 0; i < pmds; i++)
		pmd_clear(pmd + i);

	__flush_tlb_all();
}

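/*
 * Fill one pmd page with 2MB mappings, from @address up to @end.
 * Returns the last address handled.
 */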
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		pmd_t *pmd = pmd_page + pmd_index(address);

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd))
			continue;

		set_pte((pte_t *)pmd,
			pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
	}
	return address;
}

static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long last_map_addr;

	spin_lock(&init_mm.page_table_lock);
	last_map_addr = phys_pmd_init(pmd, address, end);
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
	return last_map_addr;
}

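/*
 * Fill one pud page with mappings from @addr up to @end: with 1GB pages
 * directly if direct_gbpages is enabled, otherwise through newly
 * allocated pmd pages. Returns the last mapped address as a pfn.
 */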
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud))
				last_map_addr = phys_pmd_update(pud, addr, end);
			continue;
		}

		if (direct_gbpages) {
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);

		spin_lock(&init_mm.page_table_lock);
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
		last_map_addr = phys_pmd_init(pmd, addr, end);
		spin_unlock(&init_mm.page_table_lock);

		unmap_low_page(pmd);
	}
	__flush_tlb_all();

	return last_map_addr >> PAGE_SHIFT;
}

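/*
 * Estimate the worst-case size of the direct-mapping page tables and
 * reserve a physical window for them from the e820 map.
 */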
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
	if (!direct_gbpages) {
		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
		tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
	}

	/*
	 * RED-PEN putting page tables only on node 0 could
	 * cause a hotspot and fill up ZONE_DMA. The page tables
	 * need roughly 0.5KB per GB.
	 */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT,
		(table_start << PAGE_SHIFT) + tables);
}

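/* Use 1GB pages for the direct mapping only if the CPU supports them. */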
static void __init init_gbpages(void)
{
	if (direct_gbpages && cpu_has_gbpages)
		printk(KERN_INFO "Using GB pages for direct mapping\n");
	else
		direct_gbpages = 0;
}

#ifdef CONFIG_MEMTEST_BOOTPARAM

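/*
 * Write @pattern over [start_phys, start_phys + size), read it back and
 * reserve any mismatching ranges as "BAD RAM".
 */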
static void __init memtest(unsigned long start_phys, unsigned long size,
				 unsigned pattern)
{
	unsigned long i;
	unsigned long *start;
	unsigned long start_bad;
	unsigned long last_bad;
	unsigned long val;
	unsigned long start_phys_aligned;
	unsigned long count;
	unsigned long incr;

	switch (pattern) {
	case 0:
		val = 0UL;
		break;
	case 1:
		val = -1UL;
		break;
	case 2:
		val = 0x5555555555555555UL;
		break;
	case 3:
		val = 0xaaaaaaaaaaaaaaaaUL;
		break;
	default:
		return;
	}

	incr = sizeof(unsigned long);
	start_phys_aligned = ALIGN(start_phys, incr);
	count = (size - (start_phys_aligned - start_phys))/incr;
	start = __va(start_phys_aligned);
	start_bad = 0;
	last_bad = 0;

	for (i = 0; i < count; i++)
		start[i] = val;
	for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
		if (*start != val) {
			if (start_phys_aligned == last_bad + incr) {
				last_bad += incr;
			} else {
				if (start_bad) {
					printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
						val, start_bad, last_bad + incr);
					reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
				}
				start_bad = last_bad = start_phys_aligned;
			}
		}
	}
	if (start_bad) {
		printk(KERN_CONT "\n  %016lx bad mem addr %016lx - %016lx reserved",
			val, start_bad, last_bad + incr);
		reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
	}

}

static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;

static int __init parse_memtest(char *arg)
{
	if (arg)
		memtest_pattern = simple_strtoul(arg, NULL, 0);
	return 0;
}

early_param("memtest", parse_memtest);

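/*
 * Run each requested test pattern over every free e820 area between
 * @start and @end.
 */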
static void __init early_memtest(unsigned long start, unsigned long end)
{
	u64 t_start, t_size;
	unsigned pattern;

	if (!memtest_pattern)
		return;

	printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
	for (pattern = 0; pattern < memtest_pattern; pattern++) {
		t_start = start;
		t_size = 0;
		while (t_start < end) {
			t_start = find_e820_area_size(t_start, &t_size, 1);

			/* done ? */
			if (t_start >= end)
				break;
			if (t_start + t_size > end)
				t_size = end - t_start;

			printk(KERN_CONT "\n  %016llx - %016llx pattern %d",
				t_start, t_start + t_size, pattern);

			memtest(t_start, t_size, pattern);

			t_start += t_size;
		}
	}
	printk(KERN_CONT "\n");
}
#else
static void __init early_memtest(unsigned long start, unsigned long end)
{
}
#endif

/*
 * Set up the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next, last_map_addr = end;
	unsigned long start_phys = start, end_phys = end;

	printk(KERN_INFO "init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 *
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem) {
		init_gbpages();
		find_early_table_space(end);
	}

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		if (after_bootmem)
			pud = pud_offset(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
		if (!after_bootmem)
			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
		unmap_low_page(pud);
	}

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();

	if (!after_bootmem)
		reserve_early(table_start << PAGE_SHIFT,
				 table_end << PAGE_SHIFT, "PGTABLE");

	if (!after_bootmem)
		early_memtest(start_phys, end_phys);

	return last_map_addr;
}

#ifndef CONFIG_NUMA
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = end_pfn;

	memory_present(0, 0, end_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is always added to the NORMAL zone. This means you will never
 * get additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size-1);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(zone, start_pfn, nr_pages);
	WARN_ON(ret);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 * On x86, access has to be given to the first megabyte of ram because that area
 * contains bios code and data regions used by X and dosemu and similar apps.
 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
 * mmio resources as well as potential bios/acpi data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)
		return 1;
	if (!page_is_ram(pagenr))
		return 1;
	return 0;
}

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
			 kcore_modules, kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear_bss() has already cleared the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = end_pfn - totalram_pages -
					absent_pages_in_range(0, end_pfn);
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
				"%ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);

	cpa_init();
}

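/*
 * Free the pages in [begin, end), poisoning them first so that stale
 * references to init memory are easier to catch.
 */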
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr = begin;

	if (addr >= end)
		return;

	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
		begin, PAGE_ALIGN(end));
	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);

	for (; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
#endif
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	set_memory_nx(start, (end - start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}

#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

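/*
 * Reserve a bootmem range on the node that owns it and account any
 * ZONE_DMA part of it in dma_reserve.
 */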
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= end_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
				phys, len);
		return;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	if (nid == next_nid)
		reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
	else
		reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#else
	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif

	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}
}

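/*
 * Walk the kernel page tables to check whether @addr is backed by a
 * valid page; 2MB large pages are handled at the pmd level.
 */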
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			pte_t entry;
			void *p;

			p = vmemmap_alloc_block(PMD_SIZE, node);
			if (!p)
				return -ENOMEM;

			entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
							PAGE_KERNEL_LARGE);
			set_pmd(pmd, __pmd(pte_val(entry)));

			/* check to see if we have contiguous blocks */
			if (p_end != p || node_start != node) {
				if (p_start)
					printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						addr_start, addr_end-1, p_start, p_end-1, node_start);
				addr_start = addr;
				node_start = node;
				p_start = p;
			}
			addr_end = addr + PMD_SIZE;
			p_end = p + PMD_SIZE;
		} else {
			vmemmap_verify((pte_t *)pmd, node, addr, next);
		}
	}
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif