Commit 56a432f5 authored by Yang Shi, committed by Caspar Zhang

alinux: mm: thp: add fast_cow switch

task #27327988

The commit ("thp: change CoW semantics for anon-THP") rewrites the THP CoW
page fault handler to allocate a base page only, but there is a request to
keep the old behavior just in case.  So, introduce a new sysfs knob,
fast_cow, to control the behavior; the new behavior is the default.
Write 0 to the knob to switch back to the old behavior.
Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
[ caspar: fix checkpatch.pl warnings ]
Acked-by: Caspar Zhang <caspar@linux.alibaba.com>
Parent e5b2cc5d
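As a usage illustration only (not part of the patch): assuming the new attribute appears alongside the other THP knobs under /sys/kernel/mm/transparent_hugepage/ (where the hugepage_attr[] entries are exposed), a minimal C sketch for switching back to the old CoW behavior could look like the following; the path and error handling here are illustrative assumptions.

/*
 * Minimal sketch: write "0" to the (assumed) fast_cow knob to select the
 * old huge-page-copy CoW path; writing "1" restores the default fast path.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        const char *knob = "/sys/kernel/mm/transparent_hugepage/fast_cow";
        int fd = open(knob, O_WRONLY);

        if (fd < 0) {
                perror("open");         /* knob missing: kernel lacks this patch */
                return 1;
        }
        if (write(fd, "0", 1) != 1) {   /* the store handler accepts "0" or "1" */
                perror("write");
                close(fd);
                return 1;
        }
        close(fd);
        return 0;
}

Reading the same file back reports the current setting as 0 or 1.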
@@ -61,6 +61,7 @@ enum transparent_hugepage_flag {
 #ifdef CONFIG_DEBUG_VM
         TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
 #endif
+        TRANSPARENT_HUGEPAGE_FAST_COW_FLAG,
 };

 struct kobject;
@@ -131,6 +132,9 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma);
 #else /* CONFIG_DEBUG_VM */
 #define transparent_hugepage_debug_cow() 0
 #endif /* CONFIG_DEBUG_VM */
+#define transparent_hugepage_fast_cow()                         \
+        (transparent_hugepage_flags &                           \
+         (1<<TRANSPARENT_HUGEPAGE_FAST_COW_FLAG))

 extern unsigned long thp_get_unmapped_area(struct file *filp,
                 unsigned long addr, unsigned long len, unsigned long pgoff,
...
@@ -56,7 +56,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
 #endif
         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
-        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)|
+        (1<<TRANSPARENT_HUGEPAGE_FAST_COW_FLAG);

 static struct shrinker deferred_split_shrinker;
@@ -325,6 +326,22 @@ static struct kobj_attribute debug_cow_attr =
         __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
 #endif /* CONFIG_DEBUG_VM */

+static ssize_t fast_cow_show(struct kobject *kobj,
+                             struct kobj_attribute *attr, char *buf)
+{
+        return single_hugepage_flag_show(kobj, attr, buf,
+                                         TRANSPARENT_HUGEPAGE_FAST_COW_FLAG);
+}
+
+static ssize_t fast_cow_store(struct kobject *kobj,
+                              struct kobj_attribute *attr,
+                              const char *buf, size_t count)
+{
+        return single_hugepage_flag_store(kobj, attr, buf, count,
+                                          TRANSPARENT_HUGEPAGE_FAST_COW_FLAG);
+}
+
+static struct kobj_attribute fast_cow_attr =
+        __ATTR(fast_cow, 0644, fast_cow_show, fast_cow_store);
+
 static struct attribute *hugepage_attr[] = {
         &enabled_attr.attr,
         &defrag_attr.attr,
@@ -336,6 +353,7 @@ static struct attribute *hugepage_attr[] = {
 #ifdef CONFIG_DEBUG_VM
         &debug_cow_attr.attr,
 #endif
+        &fast_cow_attr.attr,
         NULL,
 };
@@ -1184,7 +1202,7 @@ void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
         spin_unlock(vmf->ptl);
 }

-vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
+static vm_fault_t do_huge_pmd_wp_page_fast(struct vm_fault *vmf, pmd_t orig_pmd)
 {
         struct vm_area_struct *vma = vmf->vma;
         struct page *page;
@@ -1243,6 +1261,278 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
         return VM_FAULT_FALLBACK;
 }
+
+static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
+                pmd_t orig_pmd, struct page *page)
+{
+        struct vm_area_struct *vma = vmf->vma;
+        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+        struct mem_cgroup *memcg;
+        pgtable_t pgtable;
+        pmd_t _pmd;
+        int i;
+        vm_fault_t ret = 0;
+        struct page **pages;
+        unsigned long mmun_start;       /* For mmu_notifiers */
+        unsigned long mmun_end;         /* For mmu_notifiers */
+
+        pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
+                              GFP_KERNEL);
+        if (unlikely(!pages)) {
+                ret |= VM_FAULT_OOM;
+                goto out;
+        }
+
+        for (i = 0; i < HPAGE_PMD_NR; i++) {
+                pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
+                                vmf->address, page_to_nid(page));
+                if (unlikely(!pages[i] ||
+                             mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
+                                     GFP_KERNEL, &memcg, false))) {
+                        if (pages[i])
+                                put_page(pages[i]);
+                        while (--i >= 0) {
+                                memcg = (void *)page_private(pages[i]);
+                                set_page_private(pages[i], 0);
+                                mem_cgroup_cancel_charge(pages[i], memcg,
+                                                false);
+                                put_page(pages[i]);
+                        }
+                        kfree(pages);
+                        ret |= VM_FAULT_OOM;
+                        goto out;
+                }
+                set_page_private(pages[i], (unsigned long)memcg);
+        }
+
+        for (i = 0; i < HPAGE_PMD_NR; i++) {
+                copy_user_highpage(pages[i], page + i,
+                                   haddr + PAGE_SIZE * i, vma);
+                __SetPageUptodate(pages[i]);
+                cond_resched();
+        }
+
+        mmun_start = haddr;
+        mmun_end = haddr + HPAGE_PMD_SIZE;
+        mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+
+        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
+                goto out_free_pages;
+        VM_BUG_ON_PAGE(!PageHead(page), page);
+
+        /*
+         * Leave pmd empty until pte is filled note we must notify here as
+         * concurrent CPU thread might write to new page before the call to
+         * mmu_notifier_invalidate_range_end() happens which can lead to a
+         * device seeing memory write in different order than CPU.
+         *
+         * See Documentation/vm/mmu_notifier.rst
+         */
+        pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
+
+        pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
+        pmd_populate(vma->vm_mm, &_pmd, pgtable);
+
+        for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+                pte_t entry;
+                entry = mk_pte(pages[i], vma->vm_page_prot);
+                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+                memcg = (void *)page_private(pages[i]);
+                set_page_private(pages[i], 0);
+                page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
+                mem_cgroup_commit_charge(pages[i], memcg, false, false);
+                lru_cache_add_active_or_unevictable(pages[i], vma);
+                vmf->pte = pte_offset_map(&_pmd, haddr);
+                VM_BUG_ON(!pte_none(*vmf->pte));
+                set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
+                pte_unmap(vmf->pte);
+        }
+        kfree(pages);
+
+        smp_wmb(); /* make pte visible before pmd */
+        pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
+        page_remove_rmap(page, true);
+        spin_unlock(vmf->ptl);
+
+        /*
+         * No need to double call mmu_notifier->invalidate_range() callback as
+         * the above pmdp_huge_clear_flush_notify() did already call it.
+         */
+        mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
+                                               mmun_end);
+
+        ret |= VM_FAULT_WRITE;
+        put_page(page);
+
+out:
+        return ret;
+
+out_free_pages:
+        spin_unlock(vmf->ptl);
+        mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+        for (i = 0; i < HPAGE_PMD_NR; i++) {
+                memcg = (void *)page_private(pages[i]);
+                set_page_private(pages[i], 0);
+                mem_cgroup_cancel_charge(pages[i], memcg, false);
+                put_page(pages[i]);
+        }
+        kfree(pages);
+        goto out;
+}
+
+static vm_fault_t do_huge_pmd_wp_page_slow(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+        struct vm_area_struct *vma = vmf->vma;
+        struct page *page = NULL, *new_page;
+        struct mem_cgroup *memcg;
+        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+        unsigned long mmun_start;       /* For mmu_notifiers */
+        unsigned long mmun_end;         /* For mmu_notifiers */
+        gfp_t huge_gfp;                 /* for allocation and charge */
+        vm_fault_t ret = 0;
+
+        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
+        VM_BUG_ON_VMA(!vma->anon_vma, vma);
+        if (is_huge_zero_pmd(orig_pmd))
+                goto alloc;
+        spin_lock(vmf->ptl);
+        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
+                goto out_unlock;
+
+        page = pmd_page(orig_pmd);
+        VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
+        /*
+         * We can only reuse the page if nobody else maps the huge page or it's
+         * part.
+         */
+        if (!trylock_page(page)) {
+                get_page(page);
+                spin_unlock(vmf->ptl);
+                lock_page(page);
+                spin_lock(vmf->ptl);
+                if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+                        unlock_page(page);
+                        put_page(page);
+                        goto out_unlock;
+                }
+                put_page(page);
+        }
+        if (reuse_swap_page(page, NULL)) {
+                pmd_t entry;
+                entry = pmd_mkyoung(orig_pmd);
+                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+                if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
+                        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+                ret |= VM_FAULT_WRITE;
+                unlock_page(page);
+                goto out_unlock;
+        }
+        unlock_page(page);
+        get_page(page);
+        spin_unlock(vmf->ptl);
+alloc:
+        if (__transparent_hugepage_enabled(vma) &&
+            !transparent_hugepage_debug_cow()) {
+                huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+                new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
+        } else
+                new_page = NULL;
+
+        if (likely(new_page)) {
+                prep_transhuge_page(new_page);
+        } else {
+                if (!page) {
+                        split_huge_pmd(vma, vmf->pmd, vmf->address);
+                        ret |= VM_FAULT_FALLBACK;
+                } else {
+                        ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
+                        if (ret & VM_FAULT_OOM) {
+                                split_huge_pmd(vma, vmf->pmd, vmf->address);
+                                ret |= VM_FAULT_FALLBACK;
+                        }
+                        put_page(page);
+                }
+                count_vm_event(THP_FAULT_FALLBACK);
+                goto out;
+        }
+
+        if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
+                                huge_gfp, &memcg, true))) {
+                put_page(new_page);
+                split_huge_pmd(vma, vmf->pmd, vmf->address);
+                if (page)
+                        put_page(page);
+                ret |= VM_FAULT_FALLBACK;
+                count_vm_event(THP_FAULT_FALLBACK);
+                goto out;
+        }
+
+        count_vm_event(THP_FAULT_ALLOC);
+
+        if (!page)
+                clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
+        else
+                copy_user_huge_page(new_page, page, vmf->address,
+                                    vma, HPAGE_PMD_NR);
+        __SetPageUptodate(new_page);
+
+        mmun_start = haddr;
+        mmun_end = haddr + HPAGE_PMD_SIZE;
+        mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+
+        spin_lock(vmf->ptl);
+        if (page)
+                put_page(page);
+        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+                spin_unlock(vmf->ptl);
+                mem_cgroup_cancel_charge(new_page, memcg, true);
+                put_page(new_page);
+                goto out_mn;
+        } else {
+                pmd_t entry;
+                entry = mk_huge_pmd(new_page, vma->vm_page_prot);
+                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+                pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
+                page_add_new_anon_rmap(new_page, vma, haddr, true);
+                mem_cgroup_commit_charge(new_page, memcg, false, true);
+                lru_cache_add_active_or_unevictable(new_page, vma);
+                set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+                update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+                if (!page) {
+                        add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+                } else {
+                        VM_BUG_ON_PAGE(!PageHead(page), page);
+                        page_remove_rmap(page, true);
+                        put_page(page);
+                }
+                ret |= VM_FAULT_WRITE;
+        }
+        spin_unlock(vmf->ptl);
+out_mn:
+        /*
+         * No need to double call mmu_notifier->invalidate_range() callback as
+         * the above pmdp_huge_clear_flush_notify() did already call it.
+         */
+        mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
+                                               mmun_end);
+out:
+        return ret;
+out_unlock:
+        spin_unlock(vmf->ptl);
+        return ret;
+}
+
+vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+        if (transparent_hugepage_fast_cow())
+                return do_huge_pmd_wp_page_fast(vmf, orig_pmd);
+
+        return do_huge_pmd_wp_page_slow(vmf, orig_pmd);
+}
+
 /*
  * FOLL_FORCE can write to even unwritable pmd's, but only
  * after we've gone through a COW cycle and they are dirty.
...