/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>

/*
 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
 * The direct mapping extends to max_pfn_mapped, so that we can directly access
 * apertures, ACPI and other tables without having to play with fixmaps.
 */
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

int direct_gbpages
#ifdef CONFIG_DIRECT_GBPAGES
				= 1
#endif
;

static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * on the physical space so we can cache the place of the first one and
 * move around without checking the pgd every time.
 */

int after_bootmem;

pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);

static int do_not_nx __cpuinitdata;

/*
 * noexec=on|off
 * Control non-executable mappings for 64-bit processes.
 *
 * on	Enable (default)
 * off	Disable
 */
static int __init nonx_setup(char *str)
{
	if (!str)
		return -EINVAL;
	if (!strncmp(str, "on", 2)) {
		__supported_pte_mask |= _PAGE_NX;
		do_not_nx = 0;
	} else if (!strncmp(str, "off", 3)) {
		do_not_nx = 1;
		__supported_pte_mask &= ~_PAGE_NX;
	}
	return 0;
}
early_param("noexec", nonx_setup);

void __cpuinit check_efer(void)
{
	unsigned long efer;

	rdmsrl(MSR_EFER, efer);
	if (!(efer & EFER_NX) || do_not_nx)
		__supported_pte_mask &= ~_PAGE_NX;
}

int force_personality32;

/*
 * noexec32=on|off
 * Control non executable heap for 32bit processes.
 * To control the stack too use noexec=off
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	else if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	return 1;
}
__setup("noexec32=", nonx32_setup);

/*
 * NOTE: This function is marked __ref because it calls __init function
 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

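/*
 * Install one kernel pte at 'vaddr' in the page-table tree rooted at
 * 'pud_page', allocating the intermediate pmd and pte pages with
 * spp_getpage() when they are missing.
 */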
void
set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
				pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
			return;
		}
	}

	pte = pte_offset_kernel(pmd, vaddr);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

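/*
 * Map 'vaddr' to 'pteval' in the kernel page tables. The pgd entry must
 * already exist (head.S sets up the fixmap pgd); the lower levels are
 * filled in by set_pte_vaddr_pud().
 */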
void
set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			pud = (pud_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(pgd, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;
static unsigned long __meminitdata table_top;

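/*
 * Allocate one zeroed page for building the early page tables and return
 * its physical address through 'phys'. Before bootmem is available the
 * page comes from the [table_start, table_top) range reserved by
 * find_early_table_space() and is temporarily mapped with early_memremap();
 * later a normal GFP_ATOMIC allocation is used.
 */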
static __ref void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= table_top)
		panic("alloc_low_page: ran out of memory");

	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

static __ref void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

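/*
 * Create 4k mappings in one pte page for the physical range [addr, end),
 * reusing entries that are already present. Returns the physical address
 * just past the last page mapped.
 */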
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
	      pgprot_t prot)
{
	unsigned pages = 0;
	unsigned long last_map_addr = end;
	int i;

	pte_t *pte = pte_page + pte_index(addr);

	for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume whoever pre-set up these
		 * mappings knew what they were doing.
		 */
		if (pte_val(*pte)) {
			pages++;
			continue;
		}

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
		pgprot_t prot)
{
	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);

	return phys_pte_init(pte, address, end, prot);
}

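/*
 * Fill one pmd page for the physical range [address, end). 2M pages are
 * used when PG_LEVEL_2M is set in 'page_size_mask'; otherwise each pmd is
 * handed down to phys_pte_init(). Existing mappings are reused where the
 * requested page size allows it.
 */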
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
	      unsigned long page_size_mask, pgprot_t prot)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long pte_phys;
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;
		pgprot_t new_prot = prot;

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd)) {
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				last_map_addr = phys_pte_update(pmd, address,
								end, prot);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are ok with a PG_LEVEL_2M mapping, then we
			 * will reuse the existing one.
			 *
			 * Otherwise, we will split the large page mapping but
			 * keep the same protection bits (except for the large
			 * page bit), so that we don't violate Intel's TLB
			 * Application note (317080), which says that while
			 * changing the page size, the new and old translations
			 * should not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				pages++;
				continue;
			}
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte(address >> PAGE_SHIFT,
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
			continue;
		}

		pte = alloc_low_page(&pte_phys);
		last_map_addr = phys_pte_init(pte, address, end, new_prot);
		unmap_low_page(pte);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return last_map_addr;
}

static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
		unsigned long page_size_mask, pgprot_t prot)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long last_map_addr;

	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
	__flush_tlb_all();
	return last_map_addr;
}

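/*
 * Fill one pud page for the physical range [addr, end). 1G pages are used
 * when PG_LEVEL_1G is set in 'page_size_mask'; otherwise the range is
 * built with phys_pmd_init(). Before bootmem is up, ranges with no e820
 * coverage are left unmapped.
 */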
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;
		pgprot_t prot = PAGE_KERNEL;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud)) {
				last_map_addr = phys_pmd_update(pud, addr, end,
							 page_size_mask, prot);
				continue;
			}
			/*
			 * If we are ok with a PG_LEVEL_1G mapping, then we
			 * will reuse the existing one.
			 *
			 * Otherwise, we will split the gbpage mapping but
			 * keep the same protection bits (except for the large
			 * page bit), so that we don't violate Intel's TLB
			 * Application note (317080), which says that while
			 * changing the page size, the new and old translations
			 * should not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				pages++;
				continue;
			}
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
					      prot);
		unmap_low_page(pmd);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
		 unsigned long page_size_mask)
{
	pud_t *pud;

	pud = (pud_t *)pgd_page_vaddr(*pgd);

	return phys_pud_init(pud, addr, end, page_size_mask);
}

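/*
 * Estimate the worst-case size of the direct-mapping page tables for
 * memory up to 'end' and reserve a contiguous area for them via the e820
 * map. The result is published in table_start/table_end/table_top for
 * alloc_low_page().
 */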
static void __init find_early_table_space(unsigned long end, int use_pse,
					  int use_gbpages)
{
	unsigned long puds, pmds, ptes, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
	if (use_gbpages) {
		unsigned long extra;
		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
	} else
		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);

	if (use_pse) {
		unsigned long extra;
		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
	} else
		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);

	/*
	 * RED-PEN putting page tables only on node 0 could
	 * cause a hotspot and fill up ZONE_DMA. The page tables
	 * need roughly 0.5KB per GB.
	 */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;
	table_top = table_start + (tables >> PAGE_SHIFT);

	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
}

static void __init init_gbpages(void)
{
	if (direct_gbpages && cpu_has_gbpages)
		printk(KERN_INFO "Using GB pages for direct mapping\n");
	else
		direct_gbpages = 0;
}

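/*
 * Build the kernel direct mapping for the physical range [start, end),
 * one pgd entry at a time, using the largest page sizes permitted by
 * 'page_size_mask'. Returns the last physical address mapped.
 */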
static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
						unsigned long end,
						unsigned long page_size_mask)
{
	unsigned long next, last_map_addr = end;

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		next = (start + PGDIR_SIZE) & PGDIR_MASK;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			last_map_addr = phys_pud_update(pgd, __pa(start),
						 __pa(end), page_size_mask);
			continue;
		}

		pud = alloc_low_page(&pud_phys);
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);
		unmap_low_page(pud);

		spin_lock(&init_mm.page_table_lock);
		pgd_populate(&init_mm, pgd, __va(pud_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	return last_map_addr;
}

struct map_range {
	unsigned long start;
	unsigned long end;
	unsigned page_size_mask;
};

#define NR_RANGE_MR 5

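/*
 * Record the pfn range [start_pfn, end_pfn) and its page size mask in the
 * next free slot of 'mr' and return the updated number of ranges.
 */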
static int save_mr(struct map_range *mr, int nr_range,
		   unsigned long start_pfn, unsigned long end_pfn,
		   unsigned long page_size_mask)
{

	if (start_pfn < end_pfn) {
		if (nr_range >= NR_RANGE_MR)
			panic("run out of range for init_memory_mapping\n");
		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
		mr[nr_range].page_size_mask = page_size_mask;
		nr_range++;
	}

	return nr_range;
}

/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
unsigned long __init_refok init_memory_mapping(unsigned long start,
					       unsigned long end)
{
	unsigned long last_map_addr = 0;
	unsigned long page_size_mask = 0;
	unsigned long start_pfn, end_pfn;
	unsigned long pos;

	struct map_range mr[NR_RANGE_MR];
	int nr_range, i;
	int use_pse, use_gbpages;

	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);

	/*
	 * Find space for the kernel direct mapping tables.
	 *
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem)
		init_gbpages();

#ifdef CONFIG_DEBUG_PAGEALLOC
	/*
	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
	 * This will simplify cpa(), which otherwise needs to support splitting
	 * large pages into small in interrupt context, etc.
	 */
	use_pse = use_gbpages = 0;
#else
	use_pse = cpu_has_pse;
	use_gbpages = direct_gbpages;
#endif

	if (use_gbpages)
		page_size_mask |= 1 << PG_LEVEL_1G;
	if (use_pse)
		page_size_mask |= 1 << PG_LEVEL_2M;

	memset(mr, 0, sizeof(mr));
	nr_range = 0;

	/* head if not big page alignment ? */
	start_pfn = start >> PAGE_SHIFT;
	pos = start_pfn << PAGE_SHIFT;
	end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
			<< (PMD_SHIFT - PAGE_SHIFT);
	if (start_pfn < end_pfn) {
		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
		pos = end_pfn << PAGE_SHIFT;
	}

	/* big page (2M) range */
	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
			 << (PMD_SHIFT - PAGE_SHIFT);
	end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
			 << (PUD_SHIFT - PAGE_SHIFT);
	if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
		end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
	if (start_pfn < end_pfn) {
		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
				page_size_mask & (1<<PG_LEVEL_2M));
		pos = end_pfn << PAGE_SHIFT;
	}

	/* big page (1G) range */
	start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
			 << (PUD_SHIFT - PAGE_SHIFT);
	end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	if (start_pfn < end_pfn) {
		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
				page_size_mask &
				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
		pos = end_pfn << PAGE_SHIFT;
	}

	/* tail is not big page (1G) alignment */
	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
			 << (PMD_SHIFT - PAGE_SHIFT);
	end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	if (start_pfn < end_pfn) {
		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
				page_size_mask & (1<<PG_LEVEL_2M));
		pos = end_pfn << PAGE_SHIFT;
	}

	/* tail is not big page (2M) alignment */
	start_pfn = pos>>PAGE_SHIFT;
	end_pfn = end>>PAGE_SHIFT;
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);

	/* try to merge contiguous ranges with the same page size */
	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
		unsigned long old_start;
		if (mr[i].end != mr[i+1].start ||
		    mr[i].page_size_mask != mr[i+1].page_size_mask)
			continue;
		/* move it */
		old_start = mr[i].start;
		memmove(&mr[i], &mr[i+1],
			 (nr_range - 1 - i) * sizeof(struct map_range));
		mr[i--].start = old_start;
		nr_range--;
	}

	for (i = 0; i < nr_range; i++)
		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
				mr[i].start, mr[i].end,
			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));

	if (!after_bootmem)
		find_early_table_space(end, use_pse, use_gbpages);

	for (i = 0; i < nr_range; i++)
		last_map_addr = kernel_physical_mapping_init(
					mr[i].start, mr[i].end,
					mr[i].page_size_mask);

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();

	if (!after_bootmem && table_end > table_start)
		reserve_early(table_start << PAGE_SHIFT,
				 table_end << PAGE_SHIFT, "PGTABLE");

	printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
			 last_map_addr, end);

	if (!after_bootmem)
		early_memtest(start, end);

	return last_map_addr >> PAGE_SHIFT;
}

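/*
 * Flat (non-NUMA) setup: place the bootmem bitmap with the help of the
 * e820 map and register all of memory with node 0.
 */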
#ifndef CONFIG_NUMA
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long bootmap_size, bootmap;

	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
				 PAGE_SIZE);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
	/* don't touch min_low_pfn */
	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
					 0, end_pfn);
	e820_register_active_regions(0, start_pfn, end_pfn);
	free_bootmem_with_active_regions(0, end_pfn);
	early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	memory_present(0, 0, max_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is always added to the NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(nid, zone, start_pfn, nr_pages);
	WARN_ON_ONCE(ret);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 * On x86, access has to be given to the first megabyte of RAM because that
 * area contains BIOS code and data regions used by X and dosemu and similar
 * apps. Access has to be given to non-kernel-RAM areas as well; these contain
 * the PCI mmio resources as well as potential BIOS/ACPI data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)
		return 1;
	if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
		return 0;
	if (!page_is_ram(pagenr))
		return 1;
	return 0;
}


static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
			 kcore_modules, kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;
	unsigned long absent_pages;

	pci_iommu_alloc();

	/* clear_bss() has already cleared empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif

	absent_pages = absent_pages_in_range(0, max_pfn);
	reservedpages = max_pfn - totalram_pages - absent_pages;
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
			 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		absent_pages << (PAGE_SHIFT-10),
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr = begin;

	if (addr >= end)
		return;

	/*
	 * If debugging page accesses then do not free this memory but
	 * mark the pages not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
		begin, PAGE_ALIGN(end));
	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);

	for (; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
#endif
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

static int kernel_set_to_readonly;

void set_kernel_text_rw(void)
{
	unsigned long start = PFN_ALIGN(_stext);
	unsigned long end = PFN_ALIGN(__start_rodata);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read write\n",
		 start, end);

	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}

void set_kernel_text_ro(void)
{
	unsigned long start = PFN_ALIGN(_stext);
	unsigned long end = PFN_ALIGN(__start_rodata);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read only\n",
		 start, end);

	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
}

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
	unsigned long rodata_start =
		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	kernel_set_to_readonly = 1;

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}

#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

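/*
 * Reserve [phys, phys+len) in the bootmem allocator, picking the right
 * node on NUMA, and account the region against the DMA zone reserve when
 * it lies below MAX_DMA_PFN.
 */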
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
				   int flags)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
	int ret;
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= max_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return -EFAULT;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
				phys, len);
		return -EFAULT;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	if (nid == next_nid)
		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
	else
		ret = reserve_bootmem(phys, len, flags);

	if (ret != 0)
		return ret;

#else
	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif

	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}

	return 0;
}

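/*
 * Check whether a kernel virtual address is canonical and backed by a
 * present page-table entry with a valid pfn.
 */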
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}
T
/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif