/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>

/*
 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
 * The direct mapping extends to max_pfn_mapped, so that we can directly access
 * apertures, ACPI and other tables without having to play with fixmaps.
 */
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

int direct_gbpages
#ifdef CONFIG_DIRECT_GBPAGES
				= 1
#endif
;

static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

int after_bootmem;

unsigned long __supported_pte_mask __read_mostly = ~0UL;
EXPORT_SYMBOL_GPL(__supported_pte_mask);

static int do_not_nx __cpuinitdata;

/*
 * noexec=on|off
 * Control non-executable mappings for 64-bit processes.
 *
 * on	Enable (default)
 * off	Disable
 */
static int __init nonx_setup(char *str)
{
	if (!str)
		return -EINVAL;
	if (!strncmp(str, "on", 2)) {
		__supported_pte_mask |= _PAGE_NX;
		do_not_nx = 0;
	} else if (!strncmp(str, "off", 3)) {
		do_not_nx = 1;
		__supported_pte_mask &= ~_PAGE_NX;
	}
	return 0;
}
early_param("noexec", nonx_setup);

void __cpuinit check_efer(void)
{
	unsigned long efer;

	rdmsrl(MSR_EFER, efer);
	if (!(efer & EFER_NX) || do_not_nx)
		__supported_pte_mask &= ~_PAGE_NX;
}

int force_personality32;

/*
 * noexec32=on|off
 * Control non-executable heap for 32-bit processes.
 * To control the stack as well, use noexec=off.
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	else if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	return 1;
}
__setup("noexec32=", nonx32_setup);

/*
 * NOTE: This function is marked __ref because it calls the __init function
 * alloc_bootmem_pages(). It is safe to do so ONLY while after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

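/*
 * Install a single kernel PTE below the given pud page, allocating the
 * intermediate pmd/pte pages with spp_getpage() when they are missing.
 */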
void
set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
				pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
			return;
		}
	}

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) && pte_val(new_pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

void
set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			pud = (pud_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(pgd, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;
static unsigned long __meminitdata table_top;

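/*
 * Allocate one zeroed page for early page tables.  After bootmem is up the
 * normal page allocator is used; before that, pages come from the window
 * reserved by find_early_table_space() and are mapped via early_ioremap().
 */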
static __ref void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= table_top)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

static __ref void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

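/*
 * Fill in 4k ptes for the physical range [addr, end) within one pte page.
 * Entries that are already set (e.g. by Xen) are left alone.  Returns the
 * last physical address mapped.
 */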
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
	      pgprot_t prot)
{
	unsigned pages = 0;
	unsigned long last_map_addr = end;
	int i;

	pte_t *pte = pte_page + pte_index(addr);

	for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			if (!after_bootmem) {
				for(; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume whoever pre-set up
		 * these mappings knew what they were doing.
		 */
		if (pte_val(*pte))
			continue;

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return last_map_addr;
}

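/* Look up the pte page behind *pmd and hand the range to phys_pte_init(). */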
static unsigned long __meminit
phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
		pgprot_t prot)
{
	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);

	return phys_pte_init(pte, address, end, prot);
}

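/*
 * Same as phys_pte_init() one level up: map [address, end) with 2MB pages
 * when page_size_mask allows it, otherwise allocate a pte page and fill it
 * via phys_pte_init().  Returns the last physical address mapped.
 */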
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
	      unsigned long page_size_mask, pgprot_t prot)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long pte_phys;
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;
		pgprot_t new_prot = prot;

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd)) {
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				last_map_addr = phys_pte_update(pmd, address,
								end, prot);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping,
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M))
				continue;
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte(address >> PAGE_SHIFT,
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
			continue;
		}

		pte = alloc_low_page(&pte_phys);
		last_map_addr = phys_pte_init(pte, address, end, new_prot);
		unmap_low_page(pte);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return last_map_addr;
}

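/* Look up the pmd page behind *pud and hand the range to phys_pmd_init(). */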
static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
		unsigned long page_size_mask, pgprot_t prot)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long last_map_addr;

	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
	__flush_tlb_all();
	return last_map_addr;
}

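/*
 * PUD level of the direct-mapping setup: map [addr, end) with 1GB pages when
 * page_size_mask allows it, otherwise allocate a pmd page and descend via
 * phys_pmd_init().  Returns the last physical address mapped.
 */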
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;
		pgprot_t prot = PAGE_KERNEL;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud)) {
				last_map_addr = phys_pmd_update(pud, addr, end,
							 page_size_mask, prot);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection  bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G))
				continue;
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
					      prot);
		unmap_low_page(pmd);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}

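/* Look up the pud page behind *pgd and hand the range to phys_pud_init(). */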
static unsigned long __meminit
phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
		 unsigned long page_size_mask)
{
	pud_t *pud;

	pud = (pud_t *)pgd_page_vaddr(*pgd);

	return phys_pud_init(pud, addr, end, page_size_mask);
}

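/*
 * Estimate the worst-case number of pud/pmd/pte pages needed to map memory
 * up to 'end' and reserve a physical window for them in the e820 map.
 */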
static void __init find_early_table_space(unsigned long end, int use_pse,
					  int use_gbpages)
{
	unsigned long puds, pmds, ptes, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
	if (use_gbpages) {
		unsigned long extra;
		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
	} else
		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);

	if (use_pse) {
		unsigned long extra;
		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
	} else
		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);

	/*
	 * RED-PEN putting page tables only on node 0 could
	 * cause a hotspot and fill up ZONE_DMA. The page tables
	 * need roughly 0.5KB per GB.
	 */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;
	table_top = table_start + (tables >> PAGE_SHIFT);

	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
}

static void __init init_gbpages(void)
{
	if (direct_gbpages && cpu_has_gbpages)
		printk(KERN_INFO "Using GB pages for direct mapping\n");
	else
		direct_gbpages = 0;
}

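/*
 * Build the kernel direct mapping for the physical range [start, end),
 * one pgd entry at a time, using the page sizes allowed by page_size_mask.
 */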
static unsigned long __init kernel_physical_mapping_init(unsigned long start,
						unsigned long end,
						unsigned long page_size_mask)
{

	unsigned long next, last_map_addr = end;

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		next = (start + PGDIR_SIZE) & PGDIR_MASK;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			last_map_addr = phys_pud_update(pgd, __pa(start),
						 __pa(end), page_size_mask);
			continue;
		}

		pud = alloc_low_page(&pud_phys);
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);
		unmap_low_page(pud);

		spin_lock(&init_mm.page_table_lock);
		pgd_populate(&init_mm, pgd, __va(pud_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	return last_map_addr;
}

struct map_range {
	unsigned long start;
	unsigned long end;
	unsigned page_size_mask;
};

#define NR_RANGE_MR 5

static int save_mr(struct map_range *mr, int nr_range,
		   unsigned long start_pfn, unsigned long end_pfn,
		   unsigned long page_size_mask)
{

	if (start_pfn < end_pfn) {
		if (nr_range >= NR_RANGE_MR)
			panic("run out of range for init_memory_mapping\n");
		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
		mr[nr_range].page_size_mask = page_size_mask;
		nr_range++;
	}

	return nr_range;
}

/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
unsigned long __init_refok init_memory_mapping(unsigned long start,
					       unsigned long end)
{
	unsigned long last_map_addr = 0;
	unsigned long page_size_mask = 0;
	unsigned long start_pfn, end_pfn;

	struct map_range mr[NR_RANGE_MR];
	int nr_range, i;
	int use_pse, use_gbpages;

	printk(KERN_INFO "init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 *
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem)
		init_gbpages();

#ifdef CONFIG_DEBUG_PAGEALLOC
	/*
	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
	 * This will simplify cpa(), which otherwise needs to support splitting
	 * large pages into small in interrupt context, etc.
	 */
	use_pse = use_gbpages = 0;
#else
	use_pse = cpu_has_pse;
	use_gbpages = direct_gbpages;
#endif

	if (use_gbpages)
		page_size_mask |= 1 << PG_LEVEL_1G;
	if (use_pse)
		page_size_mask |= 1 << PG_LEVEL_2M;

	memset(mr, 0, sizeof(mr));
	nr_range = 0;

	/* head, if start is not big page (2M) aligned */
	start_pfn = start >> PAGE_SHIFT;
	end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
			<< (PMD_SHIFT - PAGE_SHIFT);
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);

	/* big page (2M) range*/
	start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
			 << (PMD_SHIFT - PAGE_SHIFT);
	end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
			 << (PUD_SHIFT - PAGE_SHIFT);
	if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
		end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
			page_size_mask & (1<<PG_LEVEL_2M));

	/* big page (1G) range */
	start_pfn = end_pfn;
	end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
				page_size_mask &
				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));

	/* tail is not big page (1G) alignment */
	start_pfn = end_pfn;
	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
			page_size_mask & (1<<PG_LEVEL_2M));

	/* tail is not big page (2M) alignment */
	start_pfn = end_pfn;
	end_pfn = end>>PAGE_SHIFT;
	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);

	/* try to merge same page size and continuous */
	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
		unsigned long old_start;
		if (mr[i].end != mr[i+1].start ||
		    mr[i].page_size_mask != mr[i+1].page_size_mask)
			continue;
		/* move it */
		old_start = mr[i].start;
		memmove(&mr[i], &mr[i+1],
			 (nr_range - 1 - i) * sizeof (struct map_range));
		mr[i].start = old_start;
		nr_range--;
	}

	for (i = 0; i < nr_range; i++)
		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
				mr[i].start, mr[i].end,
			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));

	if (!after_bootmem)
		find_early_table_space(end, use_pse, use_gbpages);

	for (i = 0; i < nr_range; i++)
		last_map_addr = kernel_physical_mapping_init(
					mr[i].start, mr[i].end,
					mr[i].page_size_mask);

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();

	if (!after_bootmem && table_end > table_start)
		reserve_early(table_start << PAGE_SHIFT,
				 table_end << PAGE_SHIFT, "PGTABLE");

	printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
			 last_map_addr, end);

	if (!after_bootmem)
		early_memtest(start, end);

	return last_map_addr >> PAGE_SHIFT;
}

#ifndef CONFIG_NUMA
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long bootmap_size, bootmap;

	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
				 PAGE_SIZE);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
	/* don't touch min_low_pfn */
	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
					 0, end_pfn);
	e820_register_active_regions(0, start_pfn, end_pfn);
	free_bootmem_with_active_regions(0, end_pfn);
	early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	memory_present(0, 0, max_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size-1);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(zone, start_pfn, nr_pages);
	WARN_ON(1);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 * On x86, access has to be given to the first megabyte of ram because that area
 * contains bios code and data regions used by X and dosemu and similar apps.
 * Access has to be given to non-kernel-ram areas as well; these contain the PCI
 * mmio resources as well as potential bios/acpi data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)
		return 1;
	if (!page_is_ram(pagenr))
		return 1;
	return 0;
}


static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
			 kcore_modules, kcore_vsyscall;

void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	start_periodic_check_for_corruption();

	pci_iommu_alloc();

	/* clear_bss() already clear the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = max_pfn - totalram_pages -
					absent_pages_in_range(0, max_pfn);
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
				 VSYSCALL_END - VSYSCALL_START);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
				"%ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr = begin;

	if (addr >= end)
		return;

	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
		begin, PAGE_ALIGN(end));
	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);

	for (; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
#endif
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
	unsigned long rodata_start =
		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;

#ifdef CONFIG_DYNAMIC_FTRACE
	/* Dynamic tracing modifies the kernel text section */
	start = rodata_start;
#endif

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}

#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

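/*
 * Reserve a physical range with the bootmem allocator, picking the right
 * node under NUMA, and account for pages taken out of the DMA zone.
 */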
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
				   int flags)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
	int ret;
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= max_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return -EFAULT;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
				phys, len);
		return -EFAULT;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	if (nid == next_nid)
		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
	else
		ret = reserve_bootmem(phys, len, flags);

	if (ret != 0)
		return ret;

#else
	reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif

	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}

	return 0;
}

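/*
 * Walk the kernel page tables and report whether 'addr' is backed by a
 * present mapping with a valid pfn.
 */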
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif