Commit 936a5fe6 authored by Andrea Arcangeli, committed by Linus Torvalds

thp: kvm mmu transparent hugepage support

This should work for both hugetlbfs and transparent hugepages.

[akpm@linux-foundation.org: bring forward PageTransCompound() addition for bisectability]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Parent 47ad8475
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
 	return ret;
 }
 
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 {
 	struct kvm_memory_slot *slot;
-	int host_level, level, max_level;
-
 	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
 	if (slot && slot->dirty_bitmap)
-		return PT_PAGE_TABLE_LEVEL;
+		return true;
+	return false;
+}
+
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+	int host_level, level, max_level;
 
 	host_level = host_mapping_level(vcpu->kvm, large_gfn);
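The hunk above splits the dirty-bitmap test out of mapping_level() into its own predicate. The point of the split is that a memslot with dirty logging enabled must always be mapped with 4k pages, because the dirty bitmap tracks writes at 4k granularity; the new mapping_level_dirty_bitmap() lets the fault handlers below compute that answer once, use it to force PT_PAGE_TABLE_LEVEL, and skip the transparent-hugepage adjustment entirely.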
@@ -2281,6 +2285,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
 	return 1;
 }
 
+static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
+					gfn_t *gfnp, pfn_t *pfnp, int *levelp)
+{
+	pfn_t pfn = *pfnp;
+	gfn_t gfn = *gfnp;
+	int level = *levelp;
+
+	/*
+	 * Check if it's a transparent hugepage. If this were a
+	 * hugetlbfs page, level wouldn't be set to
+	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
+	 * here.
+	 */
+	if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+	    level == PT_PAGE_TABLE_LEVEL &&
+	    PageTransCompound(pfn_to_page(pfn)) &&
+	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
+		unsigned long mask;
+		/*
+		 * mmu_notifier_retry was successful and we hold the
+		 * mmu_lock here, so the pmd can't become splitting
+		 * from under us, and in turn
+		 * __split_huge_page_refcount() can't run from under
+		 * us and we can safely transfer the refcount from
+		 * PG_tail to PG_head as we switch the pfn from tail
+		 * to head.
+		 */
+		*levelp = level = PT_DIRECTORY_LEVEL;
+		mask = KVM_PAGES_PER_HPAGE(level) - 1;
+		VM_BUG_ON((gfn & mask) != (pfn & mask));
+		if (pfn & mask) {
+			gfn &= ~mask;
+			*gfnp = gfn;
+			kvm_release_pfn_clean(pfn);
+			pfn &= ~mask;
+			if (!get_page_unless_zero(pfn_to_page(pfn)))
+				BUG();
+			*pfnp = pfn;
+		}
+	}
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
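The mask arithmetic in transparent_hugepage_adjust() is dense, so here is a minimal userspace sketch of the same alignment step. This is an illustration only, not kernel code: the 2MB/4k geometry and the frame numbers are assumed, and PAGES_PER_HPAGE stands in for KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL).

/*
 * Userspace sketch of the alignment arithmetic in
 * transparent_hugepage_adjust(): if a 4k guest frame (gfn) maps a 4k
 * host frame (pfn) inside a 2MB transparent hugepage, both must share
 * the same offset within the hugepage, and masking off the low bits
 * yields the aligned pair a 2MB (PT_DIRECTORY_LEVEL) mapping needs.
 */
#include <assert.h>
#include <stdio.h>

#define PAGES_PER_HPAGE 512UL	/* assumed: 2MB / 4KB on x86 */

int main(void)
{
	unsigned long gfn = 0x1234abUL;	/* hypothetical guest frame number */
	unsigned long pfn = 0x9876abUL;	/* hypothetical host frame number  */
	unsigned long mask = PAGES_PER_HPAGE - 1;

	/* gfn and pfn must be congruent modulo the hugepage size;
	 * this mirrors the VM_BUG_ON() in the patch. */
	assert((gfn & mask) == (pfn & mask));

	/* Align both down to the hugepage boundary, as the patch does
	 * with *gfnp and *pfnp before installing a huge spte. */
	printf("aligned gfn=%#lx aligned pfn=%#lx\n",
	       gfn & ~mask, pfn & ~mask);
	return 0;
}

Note how, once the pfn is rebased to the hugepage head, the patch also drops its reference on the tail page (kvm_release_pfn_clean()) and takes one on the head page (get_page_unless_zero()), so the refcount follows the page that is actually being mapped.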
@@ -2289,20 +2335,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 {
 	int r;
 	int level;
+	int force_pt_level;
 	pfn_t pfn;
 	unsigned long mmu_seq;
 	bool map_writable;
 
-	level = mapping_level(vcpu, gfn);
-
-	/*
-	 * This path builds a PAE pagetable - so we can map 2mb pages at
-	 * maximum. Therefore check if the level is larger than that.
-	 */
-	if (level > PT_DIRECTORY_LEVEL)
-		level = PT_DIRECTORY_LEVEL;
-
-	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+	if (likely(!force_pt_level)) {
+		level = mapping_level(vcpu, gfn);
+
+		/*
+		 * This path builds a PAE pagetable - so we can map
+		 * 2mb pages at maximum. Therefore check if the level
+		 * is larger than that.
+		 */
+		if (level > PT_DIRECTORY_LEVEL)
+			level = PT_DIRECTORY_LEVEL;
+
+		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+	} else
+		level = PT_PAGE_TABLE_LEVEL;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
@@ -2318,6 +2369,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
+	if (likely(!force_pt_level))
+		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
 			 prefault);
 	spin_unlock(&vcpu->kvm->mmu_lock);
@@ -2655,6 +2708,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	pfn_t pfn;
 	int r;
 	int level;
+	int force_pt_level;
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	unsigned long mmu_seq;
 	int write = error_code & PFERR_WRITE_MASK;
@@ -2667,9 +2721,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (r)
 		return r;
 
-	level = mapping_level(vcpu, gfn);
-
-	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+	if (likely(!force_pt_level)) {
+		level = mapping_level(vcpu, gfn);
+		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+	} else
+		level = PT_PAGE_TABLE_LEVEL;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
@@ -2684,6 +2741,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
+	if (likely(!force_pt_level))
+		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, gpa, write, map_writable,
 			 level, gfn, pfn, prefault);
 	spin_unlock(&vcpu->kvm->mmu_lock);
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	int r;
 	pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
+	int force_pt_level;
 	unsigned long mmu_seq;
 	bool map_writable;
@@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		return 0;
 	}
 
-	if (walker.level >= PT_DIRECTORY_LEVEL) {
+	if (walker.level >= PT_DIRECTORY_LEVEL)
+		force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
+	else
+		force_pt_level = 1;
+	if (!force_pt_level) {
 		level = min(walker.level, mapping_level(vcpu, walker.gfn));
 		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
 	}
@@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	kvm_mmu_free_some_pages(vcpu);
+	if (!force_pt_level)
+		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 			     level, &write_pt, pfn, map_writable, prefault);
 	(void)sptep;
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -409,6 +409,18 @@ static inline void ClearPageCompound(struct page *page)
 #endif /* !PAGEFLAGS_EXTENDED */
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int PageTransCompound(struct page *page)
+{
+	return PageCompound(page);
+}
+#else
+static inline int PageTransCompound(struct page *page)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_MMU
 #define __PG_MLOCKED		(1 << PG_mlocked)
 #else
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -104,8 +104,36 @@ static pfn_t fault_pfn;
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn)) {
-		struct page *page = compound_head(pfn_to_page(pfn));
-		return PageReserved(page);
+		struct page *head;
+		struct page *tail = pfn_to_page(pfn);
+		head = compound_head(tail);
+		if (head != tail) {
+			smp_rmb();
+			/*
+			 * head may be a dangling pointer.
+			 * __split_huge_page_refcount clears PageTail
+			 * before overwriting first_page, so if
+			 * PageTail is still there it means the head
+			 * pointer isn't dangling.
+			 */
+			if (PageTail(tail)) {
+				/*
+				 * the "head" is not a dangling
+				 * pointer but the hugepage may have
+				 * been split from under us (and we
+				 * may not hold a reference count on
+				 * the head page, so it can be reused
+				 * before we run PageReserved), so
+				 * we have to recheck PageTail before
+				 * returning what we just read.
+				 */
+				int reserved = PageReserved(head);
+				smp_rmb();
+				if (PageTail(tail))
+					return reserved;
+			}
+		}
+		return PageReserved(tail);
 	}
 
 	return true;
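The pair of smp_rmb() calls above implements a read/recheck pattern: confirm the guard condition (PageTail), read the guarded data (PageReserved on the head), then re-confirm the guard before trusting what was read. A rough userspace analogue using C11 acquire loads might look like the following; it is a sketch of the pattern only, every name in it is hypothetical, and it does not reproduce the kernel's exact barrier semantics.

#include <stdatomic.h>
#include <stdbool.h>

struct guarded {
	atomic_bool is_tail;	/* stands in for PageTail(tail)     */
	atomic_int head_flags;	/* stands in for PageReserved(head) */
};

/*
 * Returns true and stores the guarded value only if the guard held
 * across both reads, mirroring the double PageTail() check above.
 */
bool read_if_stable(struct guarded *g, int *out)
{
	if (!atomic_load_explicit(&g->is_tail, memory_order_acquire))
		return false;		/* not a tail page to begin with */
	*out = atomic_load_explicit(&g->head_flags, memory_order_acquire);
	/* Recheck: if the guard dropped, the value read may be stale. */
	return atomic_load_explicit(&g->is_tail, memory_order_acquire);
}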