pgtable_64.c 21.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
/*
 *  This file contains ioremap and related functions for 64-bit machines.
 *
 *  Derived from arch/ppc64/mm/init.c
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  Modifications by Paul Mackerras (PowerMac) (paulus@samba.org)
 *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
 *    Copyright (C) 1996 Paul Mackerras
 *
 *  Derived from "arch/i386/mm/init.c"
 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Dave Engebretsen <engebret@us.ibm.com>
 *      Rework for PPC64 port.
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
29
#include <linux/export.h>
30 31 32 33 34 35
#include <linux/types.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
Y
Yinghai Lu 已提交
36
#include <linux/memblock.h>
37
#include <linux/slab.h>
38
#include <linux/hugetlb.h>
39 40 41 42 43 44 45 46 47 48 49 50 51 52

#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/tlb.h>
#include <asm/processor.h>
#include <asm/cputable.h>
#include <asm/sections.h>
53
#include <asm/firmware.h>
54
#include <asm/dma.h>
D
David Gibson 已提交
55 56

#include "mmu_decl.h"
57

58 59 60
#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

61 62 63 64 65 66
/* Some sanity checking */
#if TASK_SIZE_USER64 > PGTABLE_RANGE
#error TASK_SIZE_USER64 exceeds pagetable range
#endif

#ifdef CONFIG_PPC_STD_MMU_64
67
#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
68 69 70
#error TASK_SIZE_USER64 exceeds user VSID range
#endif
#endif
71

72 73 74 75 76 77 78
#ifdef CONFIG_PPC_BOOK3S_64
/*
 * partition table and process table for ISA 3.0
 */
struct prtb_entry *process_tb;
struct patb_entry *partition_tb;
#endif
79
unsigned long ioremap_bot = IOREMAP_BASE;
80 81

#ifdef CONFIG_PPC_MMU_NOHASH
82
static __ref void *early_alloc_pgtable(unsigned long size)
83 84 85
{
	void *pt;

86
	pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS)));
87 88 89 90 91 92
	memset(pt, 0, size);

	return pt;
}
#endif /* CONFIG_PPC_MMU_NOHASH */

93
/*
94 95
 * map_kernel_page currently only called by __ioremap
 * map_kernel_page adds an entry to the ioremap page table
96 97
 * and adds an entry to the HPT, possibly bolting it
 */
98
int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
99 100 101 102 103 104
{
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

105
	if (slab_is_available()) {
106 107 108 109 110 111 112
		pgdp = pgd_offset_k(ea);
		pudp = pud_alloc(&init_mm, pgdp, ea);
		if (!pudp)
			return -ENOMEM;
		pmdp = pmd_alloc(&init_mm, pudp, ea);
		if (!pmdp)
			return -ENOMEM;
P
Paul Mackerras 已提交
113
		ptep = pte_alloc_kernel(pmdp, ea);
114 115 116 117 118
		if (!ptep)
			return -ENOMEM;
		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
							  __pgprot(flags)));
	} else {
119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
#ifdef CONFIG_PPC_MMU_NOHASH
		pgdp = pgd_offset_k(ea);
#ifdef PUD_TABLE_SIZE
		if (pgd_none(*pgdp)) {
			pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
			BUG_ON(pudp == NULL);
			pgd_populate(&init_mm, pgdp, pudp);
		}
#endif /* PUD_TABLE_SIZE */
		pudp = pud_offset(pgdp, ea);
		if (pud_none(*pudp)) {
			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
			BUG_ON(pmdp == NULL);
			pud_populate(&init_mm, pudp, pmdp);
		}
		pmdp = pmd_offset(pudp, ea);
		if (!pmd_present(*pmdp)) {
			ptep = early_alloc_pgtable(PAGE_SIZE);
			BUG_ON(ptep == NULL);
			pmd_populate_kernel(&init_mm, pmdp, ptep);
		}
		ptep = pte_offset_kernel(pmdp, ea);
		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
							  __pgprot(flags)));
#else /* CONFIG_PPC_MMU_NOHASH */
144 145 146 147
		/*
		 * If the mm subsystem is not fully up, we cannot create a
		 * linux page table entry for this mapping.  Simply bolt an
		 * entry in the hardware page table.
148
		 *
149
		 */
P
Paul Mackerras 已提交
150 151
		if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
				      mmu_io_psize, mmu_kernel_ssize)) {
152 153 154 155
			printk(KERN_ERR "Failed to do bolted mapping IO "
			       "memory at %016lx !\n", pa);
			return -ENOMEM;
		}
156
#endif /* !CONFIG_PPC_MMU_NOHASH */
157
	}
158 159

	smp_wmb();
160 161 162 163
	return 0;
}


164 165 166 167 168
/**
 * __ioremap_at - Low level function to establish the page tables
 *                for an IO mapping
 */
void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
169 170 171 172
			    unsigned long flags)
{
	unsigned long i;

B
Benjamin Herrenschmidt 已提交
173
	/* Make sure we have the base flags */
174 175 176
	if ((flags & _PAGE_PRESENT) == 0)
		flags |= pgprot_val(PAGE_KERNEL);

B
Benjamin Herrenschmidt 已提交
177 178 179 180
	/* We don't support the 4K PFN hack with ioremap */
	if (flags & _PAGE_4K_PFN)
		return NULL;

181 182 183 184
	WARN_ON(pa & ~PAGE_MASK);
	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
	WARN_ON(size & ~PAGE_MASK);

185
	for (i = 0; i < size; i += PAGE_SIZE)
186
		if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
187 188
			return NULL;

189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
	return (void __iomem *)ea;
}

/**
 * __iounmap_from - Low level function to tear down the page tables
 *                  for an IO mapping. This is used for mappings that
 *                  are manipulated manually, like partial unmapping of
 *                  PCI IOs or ISA space.
 */
void __iounmap_at(void *ea, unsigned long size)
{
	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
	WARN_ON(size & ~PAGE_MASK);

	unmap_kernel_range((unsigned long)ea, size);
204 205
}

206 207
void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
				unsigned long flags, void *caller)
208
{
209
	phys_addr_t paligned;
210 211 212 213 214 215 216 217 218 219 220
	void __iomem *ret;

	/*
	 * Choose an address to map it to.
	 * Once the imalloc system is running, we use it.
	 * Before that, we map using addresses going
	 * up from ioremap_bot.  imalloc will use
	 * the addresses from ioremap_bot through
	 * IMALLOC_END
	 * 
	 */
221 222
	paligned = addr & PAGE_MASK;
	size = PAGE_ALIGN(addr + size) - paligned;
223

224
	if ((size == 0) || (paligned == 0))
225 226
		return NULL;

227
	if (slab_is_available()) {
228
		struct vm_struct *area;
229

230 231 232
		area = __get_vm_area_caller(size, VM_IOREMAP,
					    ioremap_bot, IOREMAP_END,
					    caller);
233 234
		if (area == NULL)
			return NULL;
235 236

		area->phys_addr = paligned;
237
		ret = __ioremap_at(paligned, area->addr, size, flags);
238
		if (!ret)
239
			vunmap(area->addr);
240
	} else {
241
		ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags);
242 243 244
		if (ret)
			ioremap_bot += size;
	}
245 246 247

	if (ret)
		ret += addr & ~PAGE_MASK;
248 249 250
	return ret;
}

251 252 253 254 255
void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
			 unsigned long flags)
{
	return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
}
256

257
void __iomem * ioremap(phys_addr_t addr, unsigned long size)
258
{
259
	unsigned long flags = pgprot_val(pgprot_noncached(__pgprot(0)));
260
	void *caller = __builtin_return_address(0);
261 262

	if (ppc_md.ioremap)
263 264
		return ppc_md.ioremap(addr, size, flags, caller);
	return __ioremap_caller(addr, size, flags, caller);
265 266
}

A
Anton Blanchard 已提交
267 268
void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
{
269
	unsigned long flags = pgprot_val(pgprot_noncached_wc(__pgprot(0)));
A
Anton Blanchard 已提交
270 271 272 273 274 275 276
	void *caller = __builtin_return_address(0);

	if (ppc_md.ioremap)
		return ppc_md.ioremap(addr, size, flags, caller);
	return __ioremap_caller(addr, size, flags, caller);
}

A
Anton Blanchard 已提交
277
void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
278 279
			     unsigned long flags)
{
280 281
	void *caller = __builtin_return_address(0);

B
Benjamin Herrenschmidt 已提交
282
	/* writeable implies dirty for kernel addresses */
283
	if (flags & _PAGE_WRITE)
B
Benjamin Herrenschmidt 已提交
284 285
		flags |= _PAGE_DIRTY;

286 287 288 289 290 291 292 293 294 295 296
	/* we don't want to let _PAGE_EXEC leak out */
	flags &= ~_PAGE_EXEC;
	/*
	 * Force kernel mapping.
	 */
#if defined(CONFIG_PPC_BOOK3S_64)
	flags |= _PAGE_PRIVILEGED;
#else
	flags &= ~_PAGE_USER;
#endif

B
Benjamin Herrenschmidt 已提交
297

298 299 300 301 302 303 304 305
#ifdef _PAGE_BAP_SR
	/* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
	 * which means that we just cleared supervisor access... oops ;-) This
	 * restores it
	 */
	flags |= _PAGE_BAP_SR;
#endif

306
	if (ppc_md.ioremap)
307 308
		return ppc_md.ioremap(addr, size, flags, caller);
	return __ioremap_caller(addr, size, flags, caller);
309 310 311
}


312 313 314 315
/*  
 * Unmap an IO region and remove it from imalloc'd list.
 * Access to IO memory should be serialized by driver.
 */
316
void __iounmap(volatile void __iomem *token)
317 318 319
{
	void *addr;

320
	if (!slab_is_available())
321 322
		return;
	
323 324 325 326 327 328 329 330
	addr = (void *) ((unsigned long __force)
			 PCI_FIX_ADDR(token) & PAGE_MASK);
	if ((unsigned long)addr < ioremap_bot) {
		printk(KERN_WARNING "Attempt to iounmap early bolted mapping"
		       " at 0x%p\n", addr);
		return;
	}
	vunmap(addr);
331 332
}

333
void iounmap(volatile void __iomem *token)
334 335 336 337 338 339 340
{
	if (ppc_md.iounmap)
		ppc_md.iounmap(token);
	else
		__iounmap(token);
}

341
EXPORT_SYMBOL(ioremap);
A
Anton Blanchard 已提交
342
EXPORT_SYMBOL(ioremap_wc);
A
Anton Blanchard 已提交
343
EXPORT_SYMBOL(ioremap_prot);
344
EXPORT_SYMBOL(__ioremap);
345
EXPORT_SYMBOL(__ioremap_at);
346
EXPORT_SYMBOL(iounmap);
347
EXPORT_SYMBOL(__iounmap);
348
EXPORT_SYMBOL(__iounmap_at);
349

350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366
#ifndef __PAGETABLE_PUD_FOLDED
/* 4 level page table */
struct page *pgd_page(pgd_t pgd)
{
	if (pgd_huge(pgd))
		return pte_page(pgd_pte(pgd));
	return virt_to_page(pgd_page_vaddr(pgd));
}
#endif

struct page *pud_page(pud_t pud)
{
	if (pud_huge(pud))
		return pte_page(pud_pte(pud));
	return virt_to_page(pud_page_vaddr(pud));
}

367 368 369 370 371 372
/*
 * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags
 * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
 */
struct page *pmd_page(pmd_t pmd)
{
373
	if (pmd_trans_huge(pmd) || pmd_huge(pmd))
374
		return pte_page(pmd_pte(pmd));
375 376 377
	return virt_to_page(pmd_page_vaddr(pmd));
}

378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404
#ifdef CONFIG_PPC_64K_PAGES
static pte_t *get_from_cache(struct mm_struct *mm)
{
	void *pte_frag, *ret;

	spin_lock(&mm->page_table_lock);
	ret = mm->context.pte_frag;
	if (ret) {
		pte_frag = ret + PTE_FRAG_SIZE;
		/*
		 * If we have taken up all the fragments mark PTE page NULL
		 */
		if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
			pte_frag = NULL;
		mm->context.pte_frag = pte_frag;
	}
	spin_unlock(&mm->page_table_lock);
	return (pte_t *)ret;
}

static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
{
	void *ret = NULL;
	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
				       __GFP_REPEAT | __GFP_ZERO);
	if (!page)
		return NULL;
405 406 407 408
	if (!kernel && !pgtable_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
409 410 411 412 413 414 415 416 417

	ret = page_address(page);
	spin_lock(&mm->page_table_lock);
	/*
	 * If we find pgtable_page set, we return
	 * the allocated page with single fragement
	 * count.
	 */
	if (likely(!mm->context.pte_frag)) {
418
		set_page_count(page, PTE_FRAG_NR);
419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495
		mm->context.pte_frag = ret + PTE_FRAG_SIZE;
	}
	spin_unlock(&mm->page_table_lock);

	return (pte_t *)ret;
}

pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
{
	pte_t *pte;

	pte = get_from_cache(mm);
	if (pte)
		return pte;

	return __alloc_for_cache(mm, kernel);
}

void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
{
	struct page *page = virt_to_page(table);
	if (put_page_testzero(page)) {
		if (!kernel)
			pgtable_page_dtor(page);
		free_hot_cold_page(page, 0);
	}
}

#ifdef CONFIG_SMP
static void page_table_free_rcu(void *table)
{
	struct page *page = virt_to_page(table);
	if (put_page_testzero(page)) {
		pgtable_page_dtor(page);
		free_hot_cold_page(page, 0);
	}
}

void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
	unsigned long pgf = (unsigned long)table;

	BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
	pgf |= shift;
	tlb_remove_table(tlb, (void *)pgf);
}

void __tlb_remove_table(void *_table)
{
	void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
	unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;

	if (!shift)
		/* PTE page needs special handling */
		page_table_free_rcu(table);
	else {
		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
		kmem_cache_free(PGT_CACHE(shift), table);
	}
}
#else
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
	if (!shift) {
		/* PTE page needs special handling */
		struct page *page = virt_to_page(table);
		if (put_page_testzero(page)) {
			pgtable_page_dtor(page);
			free_hot_cold_page(page, 0);
		}
	} else {
		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
		kmem_cache_free(PGT_CACHE(shift), table);
	}
}
#endif
#endif /* CONFIG_PPC_64K_PAGES */
496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

/*
 * This is called when relaxing access to a hugepage. It's also called in the page
 * fault path when we don't hit any of the major fault cases, ie, a minor
 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
 * handled those two for us, we additionally deal with missing execute
 * permission here on some processors
 */
int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp, pmd_t entry, int dirty)
{
	int changed;
#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp));
	assert_spin_locked(&vma->vm_mm->page_table_lock);
#endif
	changed = !pmd_same(*(pmdp), entry);
	if (changed) {
		__ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
		/*
		 * Since we are not supporting SW TLB systems, we don't
		 * have any thing similar to flush_tlb_page_nohash()
		 */
	}
	return changed;
}

unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
526 527
				  pmd_t *pmdp, unsigned long clr,
				  unsigned long set)
528 529
{

530 531
	__be64 old_be, tmp;
	unsigned long old;
532 533 534 535 536 537 538 539

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp));
	assert_spin_locked(&mm->page_table_lock);
#endif

	__asm__ __volatile__(
	"1:	ldarx	%0,0,%3\n\
540
		and.	%1,%0,%6\n\
541 542
		bne-	1b \n\
		andc	%1,%0,%4 \n\
543
		or	%1,%1,%7\n\
544 545
		stdcx.	%1,0,%3 \n\
		bne-	1b"
546 547 548
	: "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
	: "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
	  "r" (cpu_to_be64(_PAGE_BUSY)), "r" (cpu_to_be64(set))
549
	: "cc" );
550

551 552
	old = be64_to_cpu(old_be);

553
	trace_hugepage_update(addr, old, clr, set);
554
	if (old & _PAGE_HASHPTE)
555
		hpte_do_hugepage_flush(mm, addr, pmdp, old);
556 557 558
	return old;
}

559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594
pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(pmd_trans_huge(*pmdp));

	pmd = *pmdp;
	pmd_clear(pmdp);
	/*
	 * Wait for all pending hash_page to finish. This is needed
	 * in case of subpage collapse. When we collapse normal pages
	 * to hugepage, we first clear the pmd, then invalidate all
	 * the PTE entries. The assumption here is that any low level
	 * page fault will see a none pmd and take the slow path that
	 * will wait on mmap_sem. But we could very well be in a
	 * hash_page with local ptep pointer value. Such a hash page
	 * can result in adding new HPTE entries for normal subpages.
	 * That means we could be modifying the page content as we
	 * copy them to a huge page. So wait for parallel hash_page
	 * to finish before invalidating HPTE entries. We can do this
	 * by sending an IPI to all the cpus and executing a dummy
	 * function there.
	 */
	kick_all_cpus_sync();
	/*
	 * Now invalidate the hpte entries in the range
	 * covered by pmd. This make sure we take a
	 * fault and will find the pmd as none, which will
	 * result in a major fault which takes mmap_sem and
	 * hence wait for collapse to complete. Without this
	 * the __collapse_huge_page_copy can result in copying
	 * the old content.
	 */
	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
595 596 597 598 599
	return pmd;
}

/*
 * We currently remove entries from the hashtable regardless of whether
600
 * the entry was young or dirty.
601 602 603 604
 *
 * We should be more intelligent about this but for the moment we override
 * these functions and force a tlb flush unconditionally
 */
605 606
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long address, pmd_t *pmdp)
607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653
{
	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}

/*
 * We want to put the pgtable in pmd and use pgtable for tracking
 * the base page size hptes
 */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	pgtable_t *pgtable_slot;
	assert_spin_locked(&mm->page_table_lock);
	/*
	 * we store the pgtable in the second half of PMD
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	*pgtable_slot = pgtable;
	/*
	 * expose the deposited pgtable to other cpus.
	 * before we set the hugepage PTE at pmd level
	 * hash fault code looks at the deposted pgtable
	 * to store hash index values.
	 */
	smp_wmb();
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pgtable_t pgtable;
	pgtable_t *pgtable_slot;

	assert_spin_locked(&mm->page_table_lock);
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Once we withdraw, mark the entry NULL.
	 */
	*pgtable_slot = NULL;
	/*
	 * We store HPTE information in the deposited PTE fragment.
	 * zero out the content on withdraw.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	return pgtable;
}

654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671
void pmdp_huge_split_prepare(struct vm_area_struct *vma,
			     unsigned long address, pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);

	/*
	 * We can't mark the pmd none here, because that will cause a race
	 * against exit_mmap. We need to continue mark pmd TRANS HUGE, while
	 * we spilt, but at the same time we wan't rest of the ppc64 code
	 * not to insert hash pte on this, because we will be modifying
	 * the deposited pgtable in the caller of this function. Hence
	 * clear the _PAGE_USER so that we move the fault handling to
	 * higher level function and that will serialize against ptl.
	 * We need to flush existing hash pte entries here even though,
	 * the translation is still valid, because we will withdraw
	 * pgtable_t after this.
	 */
672
	pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
673 674 675
}


676 677 678 679 680 681 682 683
/*
 * set a new huge pmd. We should not be called for updating
 * an existing pmd entry. That should go via pmd_hugepage_update.
 */
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
		pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_DEBUG_VM
684
	WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
685 686 687
	assert_spin_locked(&mm->page_table_lock);
	WARN_ON(!pmd_trans_huge(pmd));
#endif
688
	trace_hugepage_set_pmd(addr, pmd_val(pmd));
689 690 691
	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}

692 693 694 695
/*
 * We use this to invalidate a pmdp entry before switching from a
 * hugepte to regular pmd entry.
 */
696 697 698
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
		     pmd_t *pmdp)
{
699
	pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
700 701 702 703 704 705

	/*
	 * This ensures that generic code that rely on IRQ disabling
	 * to prevent a parallel THP split work as expected.
	 */
	kick_all_cpus_sync();
706 707 708 709 710 711 712
}

/*
 * A linux hugepage PMD was changed and the corresponding hash table entries
 * neesd to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
713
			    pmd_t *pmdp, unsigned long old_pmd)
714
{
715
	int ssize;
716 717
	unsigned int psize;
	unsigned long vsid;
718
	unsigned long flags = 0;
719
	const struct cpumask *tmp;
720

721
	/* get the base page size,vsid and segment size */
722
#ifdef CONFIG_DEBUG_VM
723
	psize = get_slice_psize(mm, addr);
724 725 726 727 728 729 730
	BUG_ON(psize == MMU_PAGE_16M);
#endif
	if (old_pmd & _PAGE_COMBO)
		psize = MMU_PAGE_4K;
	else
		psize = MMU_PAGE_64K;

731 732 733
	if (!is_kernel_addr(addr)) {
		ssize = user_segment_size(addr);
		vsid = get_vsid(mm->context.id, addr, ssize);
734 735
		WARN_ON(vsid == 0);
	} else {
736
		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
737 738
		ssize = mmu_kernel_ssize;
	}
739

740 741
	tmp = cpumask_of(smp_processor_id());
	if (cpumask_equal(mm_cpumask(mm), tmp))
742
		flags |= HPTE_LOCAL_UPDATE;
743

744
	return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
745 746 747 748
}

static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
749
	return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
750 751 752 753
}

pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
{
754
	unsigned long pmdv;
A
Aneesh Kumar K.V 已提交
755

756
	pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
757
	return pmd_set_protbits(__pmd(pmdv), pgprot);
758 759 760 761 762 763 764 765 766
}

pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
{
	return pfn_pmd(page_to_pfn(page), pgprot);
}

pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
767
	unsigned long pmdv;
768

769 770 771
	pmdv = pmd_val(pmd);
	pmdv &= _HPAGE_CHG_MASK;
	return pmd_set_protbits(__pmd(pmdv), newprot);
772 773 774 775 776 777 778 779 780 781 782 783 784 785
}

/*
 * This is called at the end of handling a user page fault, when the
 * fault has been handled by updating a HUGE PMD entry in the linux page tables.
 * We use it to preload an HPTE into the hash table corresponding to
 * the updated linux HUGE PMD entry.
 */
void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
			  pmd_t *pmd)
{
	return;
}

786 787
pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
			      unsigned long addr, pmd_t *pmdp)
788 789 790 791 792 793
{
	pmd_t old_pmd;
	pgtable_t pgtable;
	unsigned long old;
	pgtable_t *pgtable_slot;

794
	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
795 796 797 798 799 800 801 802 803 804 805 806 807
	old_pmd = __pmd(old);
	/*
	 * We have pmd == none and we are holding page_table_lock.
	 * So we can safely go and clear the pgtable hash
	 * index info.
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Let's zero out old valid and hash index details
	 * hash fault look at them.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
808 809 810 811 812 813 814 815 816 817 818
	/*
	 * Serialize against find_linux_pte_or_hugepte which does lock-less
	 * lookup in page tables with local interrupts disabled. For huge pages
	 * it casts pmd_t to pte_t. Since format of pte_t is different from
	 * pmd_t we want to prevent transit from pmd pointing to page table
	 * to pmd pointing to huge page (and back) while interrupts are disabled.
	 * We clear pmd to possibly replace it with page table pointer in
	 * different code paths. So make sure we wait for the parallel
	 * find_linux_pte_or_hugepage to finish.
	 */
	kick_all_cpus_sync();
819 820
	return old_pmd;
}
821 822 823

int has_transparent_hugepage(void)
{
824 825 826 827 828 829 830

	BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) >= MAX_ORDER,
		"hugepages can't be allocated by the buddy allocator");

	BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) < 2,
			 "We need more than 2 pages to do deferred thp split");

831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856
	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
		return 0;
	/*
	 * We support THP only if PMD_SIZE is 16MB.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
		return 0;
	/*
	 * We need to make sure that we support 16MB hugepage in a segement
	 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
	 * of 64K.
	 */
	/*
	 * If we have 64K HPTE, we will be using that by default
	 */
	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
	    (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
		return 0;
	/*
	 * Ok we only have 4K HPTE
	 */
	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
		return 0;

	return 1;
}
857
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */