pgtable_64.c 22.0 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
/*
 *  This file contains ioremap and related functions for 64-bit machines.
 *
 *  Derived from arch/ppc64/mm/init.c
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  Modifications by Paul Mackerras (PowerMac) (paulus@samba.org)
 *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
 *    Copyright (C) 1996 Paul Mackerras
 *
 *  Derived from "arch/i386/mm/init.c"
 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Dave Engebretsen <engebret@us.ibm.com>
 *      Rework for PPC64 port.
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
29
#include <linux/export.h>
30 31 32 33 34 35
#include <linux/types.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
Y
Yinghai Lu 已提交
36
#include <linux/memblock.h>
37
#include <linux/slab.h>
38
#include <linux/hugetlb.h>
39 40 41 42 43 44 45 46 47 48 49 50 51 52

#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/tlb.h>
#include <asm/processor.h>
#include <asm/cputable.h>
#include <asm/sections.h>
53
#include <asm/firmware.h>
54
#include <asm/dma.h>
D
David Gibson 已提交
55 56

#include "mmu_decl.h"
57

58 59 60
#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

61 62 63 64 65 66
/*
 * Build-time sanity checks: the user address space must fit both in the
 * range covered by the page tables and, on hash-MMU configurations, in the
 * range addressable by user VSIDs.
 */
#if TASK_SIZE_USER64 > PGTABLE_RANGE
#error TASK_SIZE_USER64 exceeds pagetable range
#endif

#ifdef CONFIG_PPC_STD_MMU_64
#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
#error TASK_SIZE_USER64 exceeds user VSID range
#endif
#endif
71

72
/*
 * Bottom of the ioremap region.  Before slab is available __ioremap_caller()
 * maps at this address and advances it by the mapped size; afterwards it is
 * the lower bound passed to __get_vm_area_caller().
 */
unsigned long ioremap_bot = IOREMAP_BASE;
73 74

#ifdef CONFIG_PPC_MMU_NOHASH
75
static __ref void *early_alloc_pgtable(unsigned long size)
76 77 78
{
	void *pt;

79
	pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS)));
80 81 82 83 84 85
	memset(pt, 0, size);

	return pt;
}
#endif /* CONFIG_PPC_MMU_NOHASH */

86
/*
87 88
 * map_kernel_page currently only called by __ioremap
 * map_kernel_page adds an entry to the ioremap page table
89 90
 * and adds an entry to the HPT, possibly bolting it
 */
91
int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
92 93 94 95 96 97
{
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

98
	if (slab_is_available()) {
99 100 101 102 103 104 105
		pgdp = pgd_offset_k(ea);
		pudp = pud_alloc(&init_mm, pgdp, ea);
		if (!pudp)
			return -ENOMEM;
		pmdp = pmd_alloc(&init_mm, pudp, ea);
		if (!pmdp)
			return -ENOMEM;
P
Paul Mackerras 已提交
106
		ptep = pte_alloc_kernel(pmdp, ea);
107 108 109 110 111
		if (!ptep)
			return -ENOMEM;
		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
							  __pgprot(flags)));
	} else {
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
#ifdef CONFIG_PPC_MMU_NOHASH
		pgdp = pgd_offset_k(ea);
#ifdef PUD_TABLE_SIZE
		if (pgd_none(*pgdp)) {
			pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
			BUG_ON(pudp == NULL);
			pgd_populate(&init_mm, pgdp, pudp);
		}
#endif /* PUD_TABLE_SIZE */
		pudp = pud_offset(pgdp, ea);
		if (pud_none(*pudp)) {
			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
			BUG_ON(pmdp == NULL);
			pud_populate(&init_mm, pudp, pmdp);
		}
		pmdp = pmd_offset(pudp, ea);
		if (!pmd_present(*pmdp)) {
			ptep = early_alloc_pgtable(PAGE_SIZE);
			BUG_ON(ptep == NULL);
			pmd_populate_kernel(&init_mm, pmdp, ptep);
		}
		ptep = pte_offset_kernel(pmdp, ea);
		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
							  __pgprot(flags)));
#else /* CONFIG_PPC_MMU_NOHASH */
137 138 139 140
		/*
		 * If the mm subsystem is not fully up, we cannot create a
		 * linux page table entry for this mapping.  Simply bolt an
		 * entry in the hardware page table.
141
		 *
142
		 */
P
Paul Mackerras 已提交
143 144
		if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
				      mmu_io_psize, mmu_kernel_ssize)) {
145 146 147 148
			printk(KERN_ERR "Failed to do bolted mapping IO "
			       "memory at %016lx !\n", pa);
			return -ENOMEM;
		}
149
#endif /* !CONFIG_PPC_MMU_NOHASH */
150
	}
151 152

	smp_wmb();
153 154 155 156
	return 0;
}


157 158 159 160 161
/**
 * __ioremap_at - Low level function to establish the page tables
 *                for an IO mapping
 * @pa:    physical start address (warns if not page aligned)
 * @ea:    kernel virtual address to map at (warns if not page aligned)
 * @size:  length of the mapping in bytes (warns if not page aligned)
 * @flags: page protection flags; PAGE_KERNEL bits are OR-ed in when
 *         _PAGE_PRESENT is not already set
 *
 * Maps the range one page at a time with map_kernel_page().
 *
 * Return: @ea as an __iomem pointer, or NULL if _PAGE_4K_PFN was requested
 * or a page could not be mapped.  Pages mapped before a failure are not
 * unmapped here.
 */
void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
			    unsigned long flags)
{
	unsigned long i;

	/* Make sure we have the base flags */
	if ((flags & _PAGE_PRESENT) == 0)
		flags |= pgprot_val(PAGE_KERNEL);

	/* We don't support the 4K PFN hack with ioremap */
	if (flags & _PAGE_4K_PFN)
		return NULL;

	WARN_ON(pa & ~PAGE_MASK);
	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
	WARN_ON(size & ~PAGE_MASK);

	for (i = 0; i < size; i += PAGE_SIZE)
		if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
			return NULL;

	return (void __iomem *)ea;
}

/**
 * __iounmap_at - Low level function to tear down the page tables
 *                for an IO mapping. This is used for mappings that
 *                are manipulated manually, like partial unmapping of
 *                PCI IOs or ISA space.
 * @ea:   kernel virtual start address (warns if not page aligned)
 * @size: length in bytes (warns if not page aligned)
 */
void __iounmap_at(void *ea, unsigned long size)
{
	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
	WARN_ON(size & ~PAGE_MASK);

	unmap_kernel_range((unsigned long)ea, size);
}

199 200
/*
 * __ioremap_caller - map a physical range into the ioremap region.
 * @addr:   physical start address (need not be page aligned)
 * @size:   length in bytes
 * @flags:  page protection flags, handed to __ioremap_at()
 * @caller: address recorded as the owner of the vm area
 *
 * The range is widened to whole pages.  Once slab is available a vm area
 * between ioremap_bot and IOREMAP_END is reserved for it; before that the
 * mapping is placed at ioremap_bot, which is then advanced past it.
 *
 * Returns the virtual address corresponding to @addr (including its offset
 * within the page), or NULL if the page-aligned size or base is zero, no vm
 * area could be obtained, or the mapping failed.
 */
void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
				unsigned long flags, void *caller)
{
	phys_addr_t paligned;
	void __iomem *ret;

	/*
	 * Choose an address to map it to.
	 * Once the imalloc system is running, we use it.
	 * Before that, we map using addresses going
	 * up from ioremap_bot.  imalloc will use
	 * the addresses from ioremap_bot through
	 * IMALLOC_END
	 */
	paligned = addr & PAGE_MASK;
	size = PAGE_ALIGN(addr + size) - paligned;

	if ((size == 0) || (paligned == 0))
		return NULL;

	if (slab_is_available()) {
		struct vm_struct *area;

		area = __get_vm_area_caller(size, VM_IOREMAP,
					    ioremap_bot, IOREMAP_END,
					    caller);
		if (area == NULL)
			return NULL;

		area->phys_addr = paligned;
		ret = __ioremap_at(paligned, area->addr, size, flags);
		/* Release the vm area again if the mapping failed. */
		if (!ret)
			vunmap(area->addr);
	} else {
		ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags);
		if (ret)
			ioremap_bot += size;
	}

	/* Re-apply the sub-page offset of the original address. */
	if (ret)
		ret += addr & ~PAGE_MASK;
	return ret;
}

244 245 246 247 248
/*
 * __ioremap - map @size bytes at physical @addr with explicit @flags,
 * recording this function's return address as the caller.
 */
void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
			 unsigned long flags)
{
	void *ra = __builtin_return_address(0);

	return __ioremap_caller(addr, size, flags, ra);
}
249

250
/*
 * ioremap - map @size bytes at physical @addr using the non-cached
 * protection bits from pgprot_noncached().
 *
 * A platform hook in ppc_md.ioremap takes precedence when set; otherwise
 * the mapping is created by __ioremap_caller().  Returns whatever the
 * chosen implementation returns.
 */
void __iomem * ioremap(phys_addr_t addr, unsigned long size)
{
	unsigned long flags = pgprot_val(pgprot_noncached(__pgprot(0)));
	void *caller = __builtin_return_address(0);

	if (ppc_md.ioremap)
		return ppc_md.ioremap(addr, size, flags, caller);
	return __ioremap_caller(addr, size, flags, caller);
}

A
Anton Blanchard 已提交
260 261
/*
 * ioremap_wc - like ioremap(), but with the protection bits produced by
 * pgprot_noncached_wc().
 *
 * A platform hook in ppc_md.ioremap takes precedence when set; otherwise
 * the mapping is created by __ioremap_caller().
 */
void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
{
	unsigned long flags = pgprot_val(pgprot_noncached_wc(__pgprot(0)));
	void *caller = __builtin_return_address(0);

	if (ppc_md.ioremap)
		return ppc_md.ioremap(addr, size, flags, caller);
	return __ioremap_caller(addr, size, flags, caller);
}

A
Anton Blanchard 已提交
270
/*
 * ioremap_prot - map @size bytes at physical @addr with caller-supplied
 * protection @flags, after sanitising them for a kernel mapping:
 * _PAGE_WRITE implies _PAGE_DIRTY, _PAGE_EXEC is stripped, and the mapping
 * is forced to be kernel-only.
 *
 * A platform hook in ppc_md.ioremap takes precedence when set; otherwise
 * the mapping is created by __ioremap_caller().
 */
void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
			     unsigned long flags)
{
	void *caller = __builtin_return_address(0);

	/* writeable implies dirty for kernel addresses */
	if (flags & _PAGE_WRITE)
		flags |= _PAGE_DIRTY;

	/* we don't want to let _PAGE_EXEC leak out */
	flags &= ~_PAGE_EXEC;
	/*
	 * Force kernel mapping.
	 */
#if defined(CONFIG_PPC_BOOK3S_64)
	flags |= _PAGE_PRIVILEGED;
#else
	flags &= ~_PAGE_USER;
#endif

#ifdef _PAGE_BAP_SR
	/* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
	 * which means that we just cleared supervisor access... oops ;-) This
	 * restores it
	 */
	flags |= _PAGE_BAP_SR;
#endif

	if (ppc_md.ioremap)
		return ppc_md.ioremap(addr, size, flags, caller);
	return __ioremap_caller(addr, size, flags, caller);
}


305 306 307 308
/*  
 * Unmap an IO region and remove it from imalloc'd list.
 * Access to IO memory should be serialized by driver.
 */
309
void __iounmap(volatile void __iomem *token)
310 311 312
{
	void *addr;

313
	if (!slab_is_available())
314 315
		return;
	
316 317 318 319 320 321 322 323
	addr = (void *) ((unsigned long __force)
			 PCI_FIX_ADDR(token) & PAGE_MASK);
	if ((unsigned long)addr < ioremap_bot) {
		printk(KERN_WARNING "Attempt to iounmap early bolted mapping"
		       " at 0x%p\n", addr);
		return;
	}
	vunmap(addr);
324 325
}

326
/*
 * iounmap - release a mapping obtained from one of the ioremap variants.
 * Uses the platform hook ppc_md.iounmap when one is set, otherwise
 * __iounmap().
 */
void iounmap(volatile void __iomem *token)
{
	if (ppc_md.iounmap)
		ppc_md.iounmap(token);
	else
		__iounmap(token);
}

334
/* ioremap/iounmap entry points exported for use by modules. */
EXPORT_SYMBOL(ioremap);
EXPORT_SYMBOL(ioremap_wc);
EXPORT_SYMBOL(ioremap_prot);
EXPORT_SYMBOL(__ioremap);
EXPORT_SYMBOL(__ioremap_at);
EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(__iounmap);
EXPORT_SYMBOL(__iounmap_at);
342

343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359
#ifndef __PAGETABLE_PUD_FOLDED
/*
 * 4 level page table only.
 *
 * Return the struct page behind a PGD entry: the mapped page itself when
 * the entry is a huge mapping, otherwise the page holding the next-level
 * table.
 */
struct page *pgd_page(pgd_t pgd)
{
	struct page *page;

	if (!pgd_huge(pgd))
		page = virt_to_page(pgd_page_vaddr(pgd));
	else
		page = pte_page(pgd_pte(pgd));

	return page;
}
#endif

/*
 * Return the struct page behind a PUD entry: the mapped page itself when
 * the entry is a huge mapping, otherwise the page holding the next-level
 * table.
 */
struct page *pud_page(pud_t pud)
{
	struct page *page;

	if (!pud_huge(pud))
		page = virt_to_page(pud_page_vaddr(pud));
	else
		page = pte_page(pud_pte(pud));

	return page;
}

360 361 362 363 364 365
/*
 * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags
 * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
 *
 * Returns the struct page for the huge page (THP or hugetlb), or for the
 * page containing the PTE table otherwise.
 */
struct page *pmd_page(pmd_t pmd)
{
	if (pmd_trans_huge(pmd) || pmd_huge(pmd))
		return pte_page(pmd_pte(pmd));
	return virt_to_page(pmd_page_vaddr(pmd));
}

371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397
#ifdef CONFIG_PPC_64K_PAGES
/*
 * Take the next PTE fragment from the per-mm cache, if there is one.
 *
 * Under mm->page_table_lock, the cached fragment pointer is advanced by
 * PTE_FRAG_SIZE; when that lands on a page boundary the page is used up and
 * the cache is cleared.  Returns the fragment, or NULL if the cache was
 * empty.
 */
static pte_t *get_from_cache(struct mm_struct *mm)
{
	void *frag, *next;

	spin_lock(&mm->page_table_lock);
	frag = mm->context.pte_frag;
	if (frag != NULL) {
		next = frag + PTE_FRAG_SIZE;
		/* Reached the end of the page: nothing left to hand out. */
		if (!((unsigned long)next & ~PAGE_MASK))
			next = NULL;
		mm->context.pte_frag = next;
	}
	spin_unlock(&mm->page_table_lock);

	return (pte_t *)frag;
}

static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
{
	void *ret = NULL;
	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
				       __GFP_REPEAT | __GFP_ZERO);
	if (!page)
		return NULL;
398 399 400 401
	if (!kernel && !pgtable_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
402 403 404 405 406 407 408 409 410

	ret = page_address(page);
	spin_lock(&mm->page_table_lock);
	/*
	 * If we find pgtable_page set, we return
	 * the allocated page with single fragement
	 * count.
	 */
	if (likely(!mm->context.pte_frag)) {
411
		set_page_count(page, PTE_FRAG_NR);
412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488
		mm->context.pte_frag = ret + PTE_FRAG_SIZE;
	}
	spin_unlock(&mm->page_table_lock);

	return (pte_t *)ret;
}

/*
 * Allocate a PTE fragment for @mm: reuse one from the per-mm cache when
 * available, otherwise allocate a new backing page.  @vmaddr is not used.
 * Returns NULL on allocation failure.
 */
pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
{
	pte_t *frag = get_from_cache(mm);

	if (!frag)
		frag = __alloc_for_cache(mm, kernel);

	return frag;
}

/*
 * Drop one reference on the page backing the PTE fragment @table.  When the
 * last reference goes away the page is freed, after running the page-table
 * destructor for non-kernel tables.  @mm is not used.
 */
void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
{
	struct page *pg = virt_to_page(table);

	/* Other fragments of this page are still in use. */
	if (!put_page_testzero(pg))
		return;

	if (!kernel)
		pgtable_page_dtor(pg);
	free_hot_cold_page(pg, 0);
}

#ifdef CONFIG_SMP
/*
 * Deferred-free counterpart of page_table_free(), called from
 * __tlb_remove_table(): drop one reference on the page backing @table and,
 * if it was the last, run the page-table destructor and free the page.
 */
static void page_table_free_rcu(void *table)
{
	struct page *pg = virt_to_page(table);

	if (!put_page_testzero(pg))
		return;

	pgtable_page_dtor(pg);
	free_hot_cold_page(pg, 0);
}

/*
 * Queue a page table for freeing through the mmu_gather.  The index size
 * @shift is stored in the low bits of the table pointer so that
 * __tlb_remove_table() can recover it.
 */
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
	unsigned long tagged;

	BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
	tagged = (unsigned long)table | shift;
	tlb_remove_table(tlb, (void *)tagged);
}

/*
 * Free a table queued by pgtable_free_tlb().  @_table carries the index
 * size in its low MAX_PGTABLE_INDEX_SIZE bits: zero means a PTE fragment,
 * anything else selects the kmem cache the table came from.
 */
void __tlb_remove_table(void *_table)
{
	unsigned long raw = (unsigned long)_table;
	unsigned shift = raw & MAX_PGTABLE_INDEX_SIZE;
	void *table = (void *)(raw & ~MAX_PGTABLE_INDEX_SIZE);

	if (shift) {
		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
		kmem_cache_free(PGT_CACHE(shift), table);
		return;
	}

	/* PTE page needs special handling */
	page_table_free_rcu(table);
}
#else
/*
 * Non-SMP variant: free the table immediately.  A zero @shift denotes a PTE
 * fragment, which is reference counted per backing page; other values
 * select the kmem cache the table came from.  @tlb is not used.
 */
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
	struct page *pg;

	if (shift) {
		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
		kmem_cache_free(PGT_CACHE(shift), table);
		return;
	}

	/* PTE page needs special handling */
	pg = virt_to_page(table);
	if (put_page_testzero(pg)) {
		pgtable_page_dtor(pg);
		free_hot_cold_page(pg, 0);
	}
}
#endif
#endif /* CONFIG_PPC_64K_PAGES */
489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

/*
 * Called when access to a hugepage is being relaxed, and from the page
 * fault path for minor updates such as _PAGE_ACCESSED or _PAGE_DIRTY.
 * Generic code has already handled those two; here we additionally deal
 * with missing execute permission on some processors.
 *
 * Returns 1 if *pmdp differed from @entry and was updated, 0 otherwise.
 * @address and @dirty are not used.
 */
int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp, pmd_t entry, int dirty)
{
#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp));
	assert_spin_locked(&vma->vm_mm->page_table_lock);
#endif
	if (pmd_same(*(pmdp), entry))
		return 0;

	/*
	 * SW TLB systems are not supported here, so there is no equivalent
	 * of flush_tlb_page_nohash() to call after the update.
	 */
	__ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
	return 1;
}

unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
519 520
				  pmd_t *pmdp, unsigned long clr,
				  unsigned long set)
521 522
{

523 524
	__be64 old_be, tmp;
	unsigned long old;
525 526 527 528 529 530 531 532

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp));
	assert_spin_locked(&mm->page_table_lock);
#endif

	__asm__ __volatile__(
	"1:	ldarx	%0,0,%3\n\
533
		and.	%1,%0,%6\n\
534 535
		bne-	1b \n\
		andc	%1,%0,%4 \n\
536
		or	%1,%1,%7\n\
537 538
		stdcx.	%1,0,%3 \n\
		bne-	1b"
539 540 541
	: "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
	: "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
	  "r" (cpu_to_be64(_PAGE_BUSY)), "r" (cpu_to_be64(set))
542
	: "cc" );
543

544 545
	old = be64_to_cpu(old_be);

546
	trace_hugepage_update(addr, old, clr, set);
547
	if (old & _PAGE_HASHPTE)
548
		hpte_do_hugepage_flush(mm, addr, pmdp, old);
549 550 551
	return old;
}

552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587
/*
 * pmdp_collapse_flush - clear a regular (non-huge) PMD ahead of collapsing
 * its pages into a hugepage, and flush the hash entries it covered.
 *
 * @address must be hugepage aligned and *pmdp must not be a huge PMD.
 * Returns the PMD value that was cleared.
 */
pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(pmd_trans_huge(*pmdp));

	pmd = *pmdp;
	pmd_clear(pmdp);
	/*
	 * Wait for all pending hash_page to finish. This is needed
	 * in case of subpage collapse. When we collapse normal pages
	 * to hugepage, we first clear the pmd, then invalidate all
	 * the PTE entries. The assumption here is that any low level
	 * page fault will see a none pmd and take the slow path that
	 * will wait on mmap_sem. But we could very well be in a
	 * hash_page with local ptep pointer value. Such a hash page
	 * can result in adding new HPTE entries for normal subpages.
	 * That means we could be modifying the page content as we
	 * copy them to a huge page. So wait for parallel hash_page
	 * to finish before invalidating HPTE entries. We can do this
	 * by sending an IPI to all the cpus and executing a dummy
	 * function there.
	 */
	kick_all_cpus_sync();
	/*
	 * Now invalidate the hpte entries in the range
	 * covered by pmd. This make sure we take a
	 * fault and will find the pmd as none, which will
	 * result in a major fault which takes mmap_sem and
	 * hence wait for collapse to complete. Without this
	 * the __collapse_huge_page_copy can result in copying
	 * the old content.
	 */
	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
	return pmd;
}

/*
 * VMA-based wrapper around __pmdp_test_and_clear_young(); returns that
 * helper's result.
 */
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long address, pmd_t *pmdp)
{
	struct mm_struct *mm = vma->vm_mm;

	return __pmdp_test_and_clear_young(mm, address, pmdp);
}

/*
 * Entries are currently removed from the hash table whether or not they
 * were young or dirty, whereas the generic routines flush only in those
 * cases, which is not good enough here.
 *
 * This could be smarter, but for now the generic version is overridden and
 * the work is delegated to __pmdp_test_and_clear_young().
 */
int pmdp_clear_flush_young(struct vm_area_struct *vma,
				  unsigned long address, pmd_t *pmdp)
{
	struct mm_struct *mm = vma->vm_mm;

	return __pmdp_test_and_clear_young(mm, address, pmdp);
}

/*
 * Stash @pgtable alongside the PMD; it is later used for tracking the
 * base page size hptes of the hugepage.  Caller must hold
 * mm->page_table_lock.
 */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	pgtable_t *slots = (pgtable_t *)pmdp;

	assert_spin_locked(&mm->page_table_lock);

	/* The deposit lives in the second half of the PMD page. */
	slots[PTRS_PER_PMD] = pgtable;

	/*
	 * Make the deposited pgtable visible to other cpus before the
	 * hugepage PTE is set at pmd level: the hash fault code looks at
	 * the deposited pgtable to store hash index values.
	 */
	smp_wmb();
}

/*
 * Take back the pgtable deposited for @pmdp by
 * pgtable_trans_huge_deposit().  The slot is reset to NULL and the
 * fragment, which held HPTE information, is zeroed before being returned.
 * Caller must hold mm->page_table_lock.
 */
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pgtable_t *slot;
	pgtable_t deposited;

	assert_spin_locked(&mm->page_table_lock);

	slot = &((pgtable_t *)pmdp)[PTRS_PER_PMD];
	deposited = *slot;
	/* Once withdrawn, the slot no longer refers to the fragment. */
	*slot = NULL;
	/* Wipe the HPTE information kept in the deposited PTE fragment. */
	memset(deposited, 0, PTE_FRAG_SIZE);

	return deposited;
}

654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671
/*
 * pmdp_huge_split_prepare - get a huge PMD ready for splitting.
 * @address must be hugepage aligned and lie in the user region.
 */
void pmdp_huge_split_prepare(struct vm_area_struct *vma,
			     unsigned long address, pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);

	/*
	 * We can't mark the pmd none here, because that will cause a race
	 * against exit_mmap. We need to keep the pmd marked TRANS HUGE while
	 * we split, but at the same time we want the rest of the ppc64 code
	 * not to insert hash ptes on this, because we will be modifying
	 * the deposited pgtable in the caller of this function. Hence
	 * set _PAGE_PRIVILEGED so that we move the fault handling to a
	 * higher level function, which will serialize against ptl.
	 * We need to flush existing hash pte entries here even though
	 * the translation is still valid, because we will withdraw
	 * pgtable_t after this.
	 */
	pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
}


676 677 678 679 680 681 682 683
/*
 * set a new huge pmd. We should not be called for updating
 * an existing pmd entry. That should go via pmd_hugepage_update.
 *
 * @pmd is written through set_pte_at() on the PTE view of @pmdp.  With
 * CONFIG_DEBUG_VM this warns if the existing entry is present (and not
 * protnone) or if @pmd is not a transparent huge entry.
 */
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
		pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_DEBUG_VM
	WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
	assert_spin_locked(&mm->page_table_lock);
	WARN_ON(!pmd_trans_huge(pmd));
#endif
	trace_hugepage_set_pmd(addr, pmd_val(pmd));
	/* void function: call the helper rather than "return"ing its result */
	set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}

692 693 694 695
/*
 * We use this to invalidate a pmdp entry before switching from a
 * hugepte to regular pmd entry.
 */
696 697 698
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
		     pmd_t *pmdp)
{
699
	pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
700 701 702 703 704 705

	/*
	 * This ensures that generic code that rely on IRQ disabling
	 * to prevent a parallel THP split work as expected.
	 */
	kick_all_cpus_sync();
706 707 708 709 710 711 712
}

/*
 * A linux hugepage PMD was changed and the corresponding hash table entries
 * neesd to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
713
			    pmd_t *pmdp, unsigned long old_pmd)
714
{
715
	int ssize;
716 717
	unsigned int psize;
	unsigned long vsid;
718
	unsigned long flags = 0;
719
	const struct cpumask *tmp;
720

721
	/* get the base page size,vsid and segment size */
722
#ifdef CONFIG_DEBUG_VM
723
	psize = get_slice_psize(mm, addr);
724 725 726 727 728 729 730
	BUG_ON(psize == MMU_PAGE_16M);
#endif
	if (old_pmd & _PAGE_COMBO)
		psize = MMU_PAGE_4K;
	else
		psize = MMU_PAGE_64K;

731 732 733
	if (!is_kernel_addr(addr)) {
		ssize = user_segment_size(addr);
		vsid = get_vsid(mm->context.id, addr, ssize);
734 735
		WARN_ON(vsid == 0);
	} else {
736
		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
737 738
		ssize = mmu_kernel_ssize;
	}
739

740 741
	tmp = cpumask_of(smp_processor_id());
	if (cpumask_equal(mm_cpumask(mm), tmp))
742
		flags |= HPTE_LOCAL_UPDATE;
743

744
	return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
745 746 747 748
}

/* Return @pmd with the bits of @pgprot OR-ed into its value. */
static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
	return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
}

/*
 * Build a PMD value from page frame number @pfn and protection @pgprot.
 * The frame address is masked with PTE_RPN_MASK before the protection bits
 * are added.
 */
pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
{
	unsigned long pmdv;

	pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
	return pmd_set_protbits(__pmd(pmdv), pgprot);
}

/* Build a PMD value mapping @page with protection @pgprot. */
pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
{
	unsigned long pfn = page_to_pfn(page);

	return pfn_pmd(pfn, pgprot);
}

/*
 * Return @pmd with its protection replaced by @newprot: only the bits in
 * _HPAGE_CHG_MASK are kept from the old value before @newprot is OR-ed in.
 */
pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
	unsigned long pmdv;

	pmdv = pmd_val(pmd);
	pmdv &= _HPAGE_CHG_MASK;
	return pmd_set_protbits(__pmd(pmdv), newprot);
}

/*
 * Hook invoked at the end of handling a user page fault that was resolved
 * by updating a HUGE PMD entry in the linux page tables.  It is the place
 * where an HPTE for the updated entry could be preloaded into the hash
 * table; this implementation does nothing.
 */
void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
			  pmd_t *pmd)
{
}

786 787
/*
 * pmdp_huge_get_and_clear - clear a huge PMD and return its old value.
 *
 * All bits of the entry are cleared through pmd_hugepage_update(), the
 * deposited pgtable's contents are zeroed, and all cpus are kicked before
 * returning.
 */
pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
			      unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	pgtable_t pgtable;
	unsigned long old;
	pgtable_t *pgtable_slot;

	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	/*
	 * We have pmd == none and we are holding page_table_lock.
	 * So we can safely go and clear the pgtable hash
	 * index info.
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Let's zero out old valid and hash index details
	 * hash fault look at them.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	/*
	 * Serialize against find_linux_pte_or_hugepte which does lock-less
	 * lookup in page tables with local interrupts disabled. For huge pages
	 * it casts pmd_t to pte_t. Since format of pte_t is different from
	 * pmd_t we want to prevent transit from pmd pointing to page table
	 * to pmd pointing to huge page (and back) while interrupts are disabled.
	 * We clear pmd to possibly replace it with page table pointer in
	 * different code paths. So make sure we wait for the parallel
	 * find_linux_pte_or_hugepage to finish.
	 */
	kick_all_cpus_sync();
	return old_pmd;
}
821 822 823

int has_transparent_hugepage(void)
{
824 825 826 827 828 829 830

	BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) >= MAX_ORDER,
		"hugepages can't be allocated by the buddy allocator");

	BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) < 2,
			 "We need more than 2 pages to do deferred thp split");

831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856
	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
		return 0;
	/*
	 * We support THP only if PMD_SIZE is 16MB.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
		return 0;
	/*
	 * We need to make sure that we support 16MB hugepage in a segement
	 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
	 * of 64K.
	 */
	/*
	 * If we have 64K HPTE, we will be using that by default
	 */
	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
	    (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
		return 0;
	/*
	 * Ok we only have 4K HPTE
	 */
	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
		return 0;

	return 1;
}
857
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */