/*
 *  This file contains ioremap and related functions for 64-bit machines.
 *
 *  Derived from arch/ppc64/mm/init.c
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  Modifications by Paul Mackerras (PowerMac) (paulus@samba.org)
 *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
 *    Copyright (C) 1996 Paul Mackerras
 *
 *  Derived from "arch/i386/mm/init.c"
 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Dave Engebretsen <engebret@us.ibm.com>
 *      Rework for PPC64 port.
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>

#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/tlb.h>
#include <asm/processor.h>
#include <asm/cputable.h>
#include <asm/sections.h>
#include <asm/firmware.h>
#include <asm/dma.h>

#include "mmu_decl.h"

/* Tracepoint definitions for THP events are instantiated in this file */
#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

#ifdef CONFIG_PPC_STD_MMU_64
#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
#error TASK_SIZE_USER64 exceeds user VSID range
#endif
#endif

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * partition table and process table for ISA 3.0
 */
struct prtb_entry *process_tb;
struct patb_entry *partition_tb;
/*
 * page table size (set at boot depending on the MMU variant, hash
 * or radix; exported because modules use the page table accessors)
 */
unsigned long __pte_index_size;
EXPORT_SYMBOL(__pte_index_size);
unsigned long __pmd_index_size;
EXPORT_SYMBOL(__pmd_index_size);
unsigned long __pud_index_size;
EXPORT_SYMBOL(__pud_index_size);
unsigned long __pgd_index_size;
EXPORT_SYMBOL(__pgd_index_size);
unsigned long __pmd_cache_index;
EXPORT_SYMBOL(__pmd_cache_index);
unsigned long __pte_table_size;
EXPORT_SYMBOL(__pte_table_size);
unsigned long __pmd_table_size;
EXPORT_SYMBOL(__pmd_table_size);
unsigned long __pud_table_size;
EXPORT_SYMBOL(__pud_table_size);
unsigned long __pgd_table_size;
EXPORT_SYMBOL(__pgd_table_size);
unsigned long __pmd_val_bits;
EXPORT_SYMBOL(__pmd_val_bits);
unsigned long __pud_val_bits;
EXPORT_SYMBOL(__pud_val_bits);
unsigned long __pgd_val_bits;
EXPORT_SYMBOL(__pgd_val_bits);
#endif

/* Next free virtual address in the ioremap region (used before slab is up) */
unsigned long ioremap_bot = IOREMAP_BASE;

104 105 106 107 108
/**
 * __ioremap_at - Low level function to establish the page tables
 *                for an IO mapping
 */
void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
109 110 111 112
			    unsigned long flags)
{
	unsigned long i;

B
Benjamin Herrenschmidt 已提交
113
	/* Make sure we have the base flags */
114 115 116
	if ((flags & _PAGE_PRESENT) == 0)
		flags |= pgprot_val(PAGE_KERNEL);

B
Benjamin Herrenschmidt 已提交
117
	/* We don't support the 4K PFN hack with ioremap */
118
	if (flags & H_PAGE_4K_PFN)
B
Benjamin Herrenschmidt 已提交
119 120
		return NULL;

121 122 123 124
	WARN_ON(pa & ~PAGE_MASK);
	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
	WARN_ON(size & ~PAGE_MASK);

125
	for (i = 0; i < size; i += PAGE_SIZE)
126
		if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
127 128
			return NULL;

129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
	return (void __iomem *)ea;
}

/**
 * __iounmap_at - Low level function to tear down the page tables
 *                for an IO mapping. This is used for mappings that
 *                are manipulated manually, like partial unmapping of
 *                PCI IOs or ISA space.
 * @ea: kernel virtual address of the mapping (page aligned)
 * @size: size of the region to unmap, page aligned
 */
void __iounmap_at(void *ea, unsigned long size)
{
	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
	WARN_ON(size & ~PAGE_MASK);

	unmap_kernel_range((unsigned long)ea, size);
}

146 147
void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
				unsigned long flags, void *caller)
148
{
149
	phys_addr_t paligned;
150 151 152 153 154 155 156 157 158 159 160
	void __iomem *ret;

	/*
	 * Choose an address to map it to.
	 * Once the imalloc system is running, we use it.
	 * Before that, we map using addresses going
	 * up from ioremap_bot.  imalloc will use
	 * the addresses from ioremap_bot through
	 * IMALLOC_END
	 * 
	 */
161 162
	paligned = addr & PAGE_MASK;
	size = PAGE_ALIGN(addr + size) - paligned;
163

164
	if ((size == 0) || (paligned == 0))
165 166
		return NULL;

167
	if (slab_is_available()) {
168
		struct vm_struct *area;
169

170 171 172
		area = __get_vm_area_caller(size, VM_IOREMAP,
					    ioremap_bot, IOREMAP_END,
					    caller);
173 174
		if (area == NULL)
			return NULL;
175 176

		area->phys_addr = paligned;
177
		ret = __ioremap_at(paligned, area->addr, size, flags);
178
		if (!ret)
179
			vunmap(area->addr);
180
	} else {
181
		ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags);
182 183 184
		if (ret)
			ioremap_bot += size;
	}
185 186 187

	if (ret)
		ret += addr & ~PAGE_MASK;
188 189 190
	return ret;
}

191 192 193 194 195
void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
			 unsigned long flags)
{
	return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
}
196

197
void __iomem * ioremap(phys_addr_t addr, unsigned long size)
198
{
199
	unsigned long flags = pgprot_val(pgprot_noncached(__pgprot(0)));
200
	void *caller = __builtin_return_address(0);
201 202

	if (ppc_md.ioremap)
203 204
		return ppc_md.ioremap(addr, size, flags, caller);
	return __ioremap_caller(addr, size, flags, caller);
205 206
}

A
Anton Blanchard 已提交
207 208
void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
{
209
	unsigned long flags = pgprot_val(pgprot_noncached_wc(__pgprot(0)));
A
Anton Blanchard 已提交
210 211 212 213 214 215 216
	void *caller = __builtin_return_address(0);

	if (ppc_md.ioremap)
		return ppc_md.ioremap(addr, size, flags, caller);
	return __ioremap_caller(addr, size, flags, caller);
}

A
Anton Blanchard 已提交
217
void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
218 219
			     unsigned long flags)
{
220 221
	void *caller = __builtin_return_address(0);

B
Benjamin Herrenschmidt 已提交
222
	/* writeable implies dirty for kernel addresses */
223
	if (flags & _PAGE_WRITE)
B
Benjamin Herrenschmidt 已提交
224 225
		flags |= _PAGE_DIRTY;

226 227 228 229 230 231 232 233 234 235 236
	/* we don't want to let _PAGE_EXEC leak out */
	flags &= ~_PAGE_EXEC;
	/*
	 * Force kernel mapping.
	 */
#if defined(CONFIG_PPC_BOOK3S_64)
	flags |= _PAGE_PRIVILEGED;
#else
	flags &= ~_PAGE_USER;
#endif

B
Benjamin Herrenschmidt 已提交
237

238 239 240 241 242 243 244 245
#ifdef _PAGE_BAP_SR
	/* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
	 * which means that we just cleared supervisor access... oops ;-) This
	 * restores it
	 */
	flags |= _PAGE_BAP_SR;
#endif

246
	if (ppc_md.ioremap)
247 248
		return ppc_md.ioremap(addr, size, flags, caller);
	return __ioremap_caller(addr, size, flags, caller);
249 250 251
}


252 253 254 255
/*  
 * Unmap an IO region and remove it from imalloc'd list.
 * Access to IO memory should be serialized by driver.
 */
256
void __iounmap(volatile void __iomem *token)
257 258 259
{
	void *addr;

260
	if (!slab_is_available())
261 262
		return;
	
263 264 265 266 267 268 269 270
	addr = (void *) ((unsigned long __force)
			 PCI_FIX_ADDR(token) & PAGE_MASK);
	if ((unsigned long)addr < ioremap_bot) {
		printk(KERN_WARNING "Attempt to iounmap early bolted mapping"
		       " at 0x%p\n", addr);
		return;
	}
	vunmap(addr);
271 272
}

273
void iounmap(volatile void __iomem *token)
274 275 276 277 278 279 280
{
	if (ppc_md.iounmap)
		ppc_md.iounmap(token);
	else
		__iounmap(token);
}

EXPORT_SYMBOL(ioremap);
EXPORT_SYMBOL(ioremap_wc);
EXPORT_SYMBOL(ioremap_prot);
EXPORT_SYMBOL(__ioremap);
EXPORT_SYMBOL(__ioremap_at);
EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(__iounmap);
EXPORT_SYMBOL(__iounmap_at);
289

290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
#ifndef __PAGETABLE_PUD_FOLDED
/* 4 level page table */
struct page *pgd_page(pgd_t pgd)
{
	if (pgd_huge(pgd))
		return pte_page(pgd_pte(pgd));
	return virt_to_page(pgd_page_vaddr(pgd));
}
#endif

/* Return the struct page backing a pud entry (huge or table). */
struct page *pud_page(pud_t pud)
{
	return pud_huge(pud) ? pte_page(pud_pte(pud))
			     : virt_to_page(pud_page_vaddr(pud));
}

307 308 309 310 311 312
/*
 * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags
 * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
 */
struct page *pmd_page(pmd_t pmd)
{
313
	if (pmd_trans_huge(pmd) || pmd_huge(pmd))
314
		return pte_page(pmd_pte(pmd));
315 316 317
	return virt_to_page(pmd_page_vaddr(pmd));
}

#ifdef CONFIG_PPC_64K_PAGES
/*
 * Hand out the next PTE fragment from the per-mm cache, or NULL when
 * the cache is empty.  Serialized by mm->page_table_lock.
 */
static pte_t *get_from_cache(struct mm_struct *mm)
{
	void *pte_frag, *ret;

	spin_lock(&mm->page_table_lock);
	ret = mm->context.pte_frag;
	if (ret) {
		pte_frag = ret + PTE_FRAG_SIZE;
		/*
		 * If we have taken up all the fragments mark PTE page NULL
		 */
		if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
			pte_frag = NULL;
		mm->context.pte_frag = pte_frag;
	}
	spin_unlock(&mm->page_table_lock);
	return (pte_t *)ret;
}

static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
{
	void *ret = NULL;
	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
				       __GFP_REPEAT | __GFP_ZERO);
	if (!page)
		return NULL;
345 346 347 348
	if (!kernel && !pgtable_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
349 350 351 352 353 354 355 356 357

	ret = page_address(page);
	spin_lock(&mm->page_table_lock);
	/*
	 * If we find pgtable_page set, we return
	 * the allocated page with single fragement
	 * count.
	 */
	if (likely(!mm->context.pte_frag)) {
358
		set_page_count(page, PTE_FRAG_NR);
359 360 361 362 363 364 365
		mm->context.pte_frag = ret + PTE_FRAG_SIZE;
	}
	spin_unlock(&mm->page_table_lock);

	return (pte_t *)ret;
}

366
pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
367 368 369 370 371 372 373 374 375
{
	pte_t *pte;

	pte = get_from_cache(mm);
	if (pte)
		return pte;

	return __alloc_for_cache(mm, kernel);
}
376
#endif /* CONFIG_PPC_64K_PAGES */
377

/*
 * Drop one reference on a PTE fragment; the backing page is freed when
 * the last fragment reference goes away.
 */
void pte_fragment_free(unsigned long *table, int kernel)
{
	struct page *page = virt_to_page(table);
	if (put_page_testzero(page)) {
		if (!kernel)
			pgtable_page_dtor(page);
		free_hot_cold_page(page, 0);
	}
}

#ifdef CONFIG_SMP
/*
 * Queue a page-table page for RCU-style freeing after the TLB flush.
 * The table's level (shift) is stashed in the pointer's low bits so
 * __tlb_remove_table() can pick the right free routine.
 */
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
	unsigned long pgf = (unsigned long)table;

	BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
	pgf |= shift;
	tlb_remove_table(tlb, (void *)pgf);
}

void __tlb_remove_table(void *_table)
{
	void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
	unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;

	if (!shift)
		/* PTE page needs special handling */
		pte_fragment_free(table, 0);
	else {
		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
		kmem_cache_free(PGT_CACHE(shift), table);
	}
}
#else
/* UP: no concurrent walkers, free the table immediately */
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
	if (!shift) {
		/* PTE page needs special handling */
		pte_fragment_free(table, 0);
	} else {
		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
		kmem_cache_free(PGT_CACHE(shift), table);
	}
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

/*
 * This is called when relaxing access to a hugepage. It's also called in the page
 * fault path when we don't hit any of the major fault cases, ie, a minor
 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
 * handled those two for us, we additionally deal with missing execute
 * permission here on some processors
 *
 * Returns non-zero when the entry was actually updated.
 */
int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp, pmd_t entry, int dirty)
{
	int changed;
#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp));
	assert_spin_locked(&vma->vm_mm->page_table_lock);
#endif
	changed = !pmd_same(*(pmdp), entry);
	if (changed) {
		__ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
		/*
		 * Since we are not supporting SW TLB systems, we don't
		 * have any thing similar to flush_tlb_page_nohash()
		 */
	}
	return changed;
}

unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
453 454
				  pmd_t *pmdp, unsigned long clr,
				  unsigned long set)
455 456
{

457 458
	__be64 old_be, tmp;
	unsigned long old;
459 460 461 462 463 464 465 466

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp));
	assert_spin_locked(&mm->page_table_lock);
#endif

	__asm__ __volatile__(
	"1:	ldarx	%0,0,%3\n\
467
		and.	%1,%0,%6\n\
468 469
		bne-	1b \n\
		andc	%1,%0,%4 \n\
470
		or	%1,%1,%7\n\
471 472
		stdcx.	%1,0,%3 \n\
		bne-	1b"
473 474
	: "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
	: "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
475
	  "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
476
	: "cc" );
477

478 479
	old = be64_to_cpu(old_be);

480
	trace_hugepage_update(addr, old, clr, set);
481
	if (old & H_PAGE_HASHPTE)
482
		hpte_do_hugepage_flush(mm, addr, pmdp, old);
483 484 485
	return old;
}

486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521
pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(pmd_trans_huge(*pmdp));

	pmd = *pmdp;
	pmd_clear(pmdp);
	/*
	 * Wait for all pending hash_page to finish. This is needed
	 * in case of subpage collapse. When we collapse normal pages
	 * to hugepage, we first clear the pmd, then invalidate all
	 * the PTE entries. The assumption here is that any low level
	 * page fault will see a none pmd and take the slow path that
	 * will wait on mmap_sem. But we could very well be in a
	 * hash_page with local ptep pointer value. Such a hash page
	 * can result in adding new HPTE entries for normal subpages.
	 * That means we could be modifying the page content as we
	 * copy them to a huge page. So wait for parallel hash_page
	 * to finish before invalidating HPTE entries. We can do this
	 * by sending an IPI to all the cpus and executing a dummy
	 * function there.
	 */
	kick_all_cpus_sync();
	/*
	 * Now invalidate the hpte entries in the range
	 * covered by pmd. This make sure we take a
	 * fault and will find the pmd as none, which will
	 * result in a major fault which takes mmap_sem and
	 * hence wait for collapse to complete. Without this
	 * the __collapse_huge_page_copy can result in copying
	 * the old content.
	 */
	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
522 523 524 525 526
	return pmd;
}

/*
 * We currently remove entries from the hashtable regardless of whether
527
 * the entry was young or dirty.
528 529 530 531
 *
 * We should be more intelligent about this but for the moment we override
 * these functions and force a tlb flush unconditionally
 */
532 533
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long address, pmd_t *pmdp)
534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580
{
	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}

/*
 * We want to put the pgtable in pmd and use pgtable for tracking
 * the base page size hptes
 */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	pgtable_t *pgtable_slot;
	assert_spin_locked(&mm->page_table_lock);
	/*
	 * we store the pgtable in the second half of PMD
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	*pgtable_slot = pgtable;
	/*
	 * expose the deposited pgtable to other cpus.
	 * before we set the hugepage PTE at pmd level
	 * hash fault code looks at the deposted pgtable
	 * to store hash index values.
	 */
	smp_wmb();
}

/* Reclaim the pgtable previously deposited in the second half of the PMD. */
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pgtable_t pgtable;
	pgtable_t *pgtable_slot;

	assert_spin_locked(&mm->page_table_lock);
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Once we withdraw, mark the entry NULL.
	 */
	*pgtable_slot = NULL;
	/*
	 * We store HPTE information in the deposited PTE fragment.
	 * zero out the content on withdraw.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	return pgtable;
}

581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598
void pmdp_huge_split_prepare(struct vm_area_struct *vma,
			     unsigned long address, pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);

	/*
	 * We can't mark the pmd none here, because that will cause a race
	 * against exit_mmap. We need to continue mark pmd TRANS HUGE, while
	 * we spilt, but at the same time we wan't rest of the ppc64 code
	 * not to insert hash pte on this, because we will be modifying
	 * the deposited pgtable in the caller of this function. Hence
	 * clear the _PAGE_USER so that we move the fault handling to
	 * higher level function and that will serialize against ptl.
	 * We need to flush existing hash pte entries here even though,
	 * the translation is still valid, because we will withdraw
	 * pgtable_t after this.
	 */
599
	pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
600 601 602
}


603 604 605 606 607 608 609 610
/*
 * set a new huge pmd. We should not be called for updating
 * an existing pmd entry. That should go via pmd_hugepage_update.
 */
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
		pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_DEBUG_VM
611
	WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
612 613 614
	assert_spin_locked(&mm->page_table_lock);
	WARN_ON(!pmd_trans_huge(pmd));
#endif
615
	trace_hugepage_set_pmd(addr, pmd_val(pmd));
616 617 618
	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}

619 620 621 622
/*
 * We use this to invalidate a pmdp entry before switching from a
 * hugepte to regular pmd entry.
 */
623 624 625
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
		     pmd_t *pmdp)
{
626
	pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
627 628 629 630 631 632

	/*
	 * This ensures that generic code that rely on IRQ disabling
	 * to prevent a parallel THP split work as expected.
	 */
	kick_all_cpus_sync();
633 634 635 636 637 638 639
}

/*
 * A linux hugepage PMD was changed and the corresponding hash table entries
 * neesd to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
640
			    pmd_t *pmdp, unsigned long old_pmd)
641
{
642
	int ssize;
643 644
	unsigned int psize;
	unsigned long vsid;
645
	unsigned long flags = 0;
646
	const struct cpumask *tmp;
647

648
	/* get the base page size,vsid and segment size */
649
#ifdef CONFIG_DEBUG_VM
650
	psize = get_slice_psize(mm, addr);
651 652
	BUG_ON(psize == MMU_PAGE_16M);
#endif
653
	if (old_pmd & H_PAGE_COMBO)
654 655 656 657
		psize = MMU_PAGE_4K;
	else
		psize = MMU_PAGE_64K;

658 659 660
	if (!is_kernel_addr(addr)) {
		ssize = user_segment_size(addr);
		vsid = get_vsid(mm->context.id, addr, ssize);
661 662
		WARN_ON(vsid == 0);
	} else {
663
		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
664 665
		ssize = mmu_kernel_ssize;
	}
666

667 668
	tmp = cpumask_of(smp_processor_id());
	if (cpumask_equal(mm_cpumask(mm), tmp))
669
		flags |= HPTE_LOCAL_UPDATE;
670

671
	return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
672 673 674 675
}

/* OR the protection bits from @pgprot into @pmd. */
static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
	return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
}

/* Build a huge-page PMD for @pfn with protection @pgprot. */
pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
{
	unsigned long pmdv;

	pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
	return pmd_set_protbits(__pmd(pmdv), pgprot);
}

/* Build a huge-page PMD that maps @page with protection @pgprot. */
pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
{
	unsigned long pfn = page_to_pfn(page);

	return pfn_pmd(pfn, pgprot);
}

/* Change the protection of @pmd to @newprot, keeping the PFN and HPTE bits. */
pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
	unsigned long pmdv;

	pmdv = pmd_val(pmd);
	pmdv &= _HPAGE_CHG_MASK;
	return pmd_set_protbits(__pmd(pmdv), newprot);
}

/*
 * This is called at the end of handling a user page fault, when the
 * fault has been handled by updating a HUGE PMD entry in the linux page tables.
 * We use it to preload an HPTE into the hash table corresponding to
 * the updated linux HUGE PMD entry.
 */
void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
			  pmd_t *pmd)
{
	/* Deliberately a no-op: no HPTE preload is done for huge PMDs here. */
	return;
}

713 714
pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
			      unsigned long addr, pmd_t *pmdp)
715 716 717 718 719 720
{
	pmd_t old_pmd;
	pgtable_t pgtable;
	unsigned long old;
	pgtable_t *pgtable_slot;

721
	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
722 723 724 725 726 727 728 729 730 731 732 733 734
	old_pmd = __pmd(old);
	/*
	 * We have pmd == none and we are holding page_table_lock.
	 * So we can safely go and clear the pgtable hash
	 * index info.
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Let's zero out old valid and hash index details
	 * hash fault look at them.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
735 736 737 738 739 740 741 742 743 744 745
	/*
	 * Serialize against find_linux_pte_or_hugepte which does lock-less
	 * lookup in page tables with local interrupts disabled. For huge pages
	 * it casts pmd_t to pte_t. Since format of pte_t is different from
	 * pmd_t we want to prevent transit from pmd pointing to page table
	 * to pmd pointing to huge page (and back) while interrupts are disabled.
	 * We clear pmd to possibly replace it with page table pointer in
	 * different code paths. So make sure we wait for the parallel
	 * find_linux_pte_or_hugepage to finish.
	 */
	kick_all_cpus_sync();
746 747
	return old_pmd;
}
748 749 750

int has_transparent_hugepage(void)
{
751

752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777
	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
		return 0;
	/*
	 * We support THP only if PMD_SIZE is 16MB.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
		return 0;
	/*
	 * We need to make sure that we support 16MB hugepage in a segement
	 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
	 * of 64K.
	 */
	/*
	 * If we have 64K HPTE, we will be using that by default
	 */
	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
	    (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
		return 0;
	/*
	 * Ok we only have 4K HPTE
	 */
	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
		return 0;

	return 1;
}
778
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */