From 56a432f556a5ee87668a1e100403414040ea49d9 Mon Sep 17 00:00:00 2001
From: Yang Shi
Date: Tue, 16 Jun 2020 10:00:58 +0800
Subject: [PATCH] alinux: mm: thp: add fast_cow switch

task #27327988

The commit ("thp: change CoW semantics for anon-THP") rewrites the THP
CoW page fault handler to allocate a base page only, but there is a
request to keep the old behavior just in case.  So introduce a new
sysfs knob, fast_cow, to control the behavior; the default is the new
behavior.  Write 0 to the knob to switch to the old behavior.

Signed-off-by: Yang Shi
Reviewed-by: Xunlei Pang
[ caspar: fix checkpatch.pl warnings ]
Acked-by: Caspar Zhang
---
 include/linux/huge_mm.h |   4 +
 mm/huge_memory.c        | 294 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 296 insertions(+), 2 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7a447e8e8481..d71e30785f03 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -61,6 +61,7 @@ enum transparent_hugepage_flag {
 #ifdef CONFIG_DEBUG_VM
 	TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
 #endif
+	TRANSPARENT_HUGEPAGE_FAST_COW_FLAG,
 };
 
 struct kobject;
@@ -131,6 +132,9 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma);
 #else /* CONFIG_DEBUG_VM */
 #define transparent_hugepage_debug_cow() 0
 #endif /* CONFIG_DEBUG_VM */
+#define transparent_hugepage_fast_cow()				\
+	(transparent_hugepage_flags &				\
+	 (1<<TRANSPARENT_HUGEPAGE_FAST_COW_FLAG))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
 	spin_unlock(vmf->ptl);
 }
 
-vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
+static vm_fault_t do_huge_pmd_wp_page_fast(struct vm_fault *vmf, pmd_t orig_pmd)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page;
@@ -1243,6 +1261,278 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 	return VM_FAULT_FALLBACK;
 }
 
+static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
+		pmd_t orig_pmd, struct page *page)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	struct mem_cgroup *memcg;
+	pgtable_t pgtable;
+	pmd_t _pmd;
+	int i;
+	vm_fault_t ret = 0;
+	struct page **pages;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+
+	pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
+			GFP_KERNEL);
+	if (unlikely(!pages)) {
+		ret |= VM_FAULT_OOM;
+		goto out;
+	}
+
+	for (i = 0; i < HPAGE_PMD_NR; i++) {
+		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
+				vmf->address, page_to_nid(page));
+		if (unlikely(!pages[i] ||
+			     mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
+					GFP_KERNEL, &memcg, false))) {
+			if (pages[i])
+				put_page(pages[i]);
+			while (--i >= 0) {
+				memcg = (void *)page_private(pages[i]);
+				set_page_private(pages[i], 0);
+				mem_cgroup_cancel_charge(pages[i], memcg,
+						false);
+				put_page(pages[i]);
+			}
+			kfree(pages);
+			ret |= VM_FAULT_OOM;
+			goto out;
+		}
+		set_page_private(pages[i], (unsigned long)memcg);
+	}
+
+	for (i = 0; i < HPAGE_PMD_NR; i++) {
+		copy_user_highpage(pages[i], page + i,
+				   haddr + PAGE_SIZE * i, vma);
+		__SetPageUptodate(pages[i]);
+		cond_resched();
+	}
+
+	mmun_start = haddr;
+	mmun_end = haddr + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+
+	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
+		goto out_free_pages;
+	VM_BUG_ON_PAGE(!PageHead(page), page);
+
+	/*
+	 * Leave pmd empty until pte is filled note we must notify here as
+	 * concurrent CPU thread might write to new page before the call to
+	 * mmu_notifier_invalidate_range_end() happens which can lead to a
+	 * device seeing memory write in different order than CPU.
+	 *
+	 * See Documentation/vm/mmu_notifier.rst
+	 */
+	pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
+
+	pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
+	pmd_populate(vma->vm_mm, &_pmd, pgtable);
+
+	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+		pte_t entry;
+
+		entry = mk_pte(pages[i], vma->vm_page_prot);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		memcg = (void *)page_private(pages[i]);
+		set_page_private(pages[i], 0);
+		page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
+		mem_cgroup_commit_charge(pages[i], memcg, false, false);
+		lru_cache_add_active_or_unevictable(pages[i], vma);
+		vmf->pte = pte_offset_map(&_pmd, haddr);
+		VM_BUG_ON(!pte_none(*vmf->pte));
+		set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
+		pte_unmap(vmf->pte);
+	}
+	kfree(pages);
+
+	smp_wmb(); /* make pte visible before pmd */
+	pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
+	page_remove_rmap(page, true);
+	spin_unlock(vmf->ptl);
+
+	/*
+	 * No need to double call mmu_notifier->invalidate_range() callback as
+	 * the above pmdp_huge_clear_flush_notify() did already call it.
+	 */
+	mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
+					mmun_end);
+
+	ret |= VM_FAULT_WRITE;
+	put_page(page);
+
+out:
+	return ret;
+
+out_free_pages:
+	spin_unlock(vmf->ptl);
+	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+	for (i = 0; i < HPAGE_PMD_NR; i++) {
+		memcg = (void *)page_private(pages[i]);
+		set_page_private(pages[i], 0);
+		mem_cgroup_cancel_charge(pages[i], memcg, false);
+		put_page(pages[i]);
+	}
+	kfree(pages);
+	goto out;
+}
+
+static vm_fault_t do_huge_pmd_wp_page_slow(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct page *page = NULL, *new_page;
+	struct mem_cgroup *memcg;
+	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+	gfp_t huge_gfp;			/* for allocation and charge */
+	vm_fault_t ret = 0;
+
+	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
+	VM_BUG_ON_VMA(!vma->anon_vma, vma);
+	if (is_huge_zero_pmd(orig_pmd))
+		goto alloc;
+	spin_lock(vmf->ptl);
+	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
+		goto out_unlock;
+
+	page = pmd_page(orig_pmd);
+	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
+	/*
+	 * We can only reuse the page if nobody else maps the huge page or it's
+	 * part.
+	 */
+	if (!trylock_page(page)) {
+		get_page(page);
+		spin_unlock(vmf->ptl);
+		lock_page(page);
+		spin_lock(vmf->ptl);
+		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+			unlock_page(page);
+			put_page(page);
+			goto out_unlock;
+		}
+		put_page(page);
+	}
+	if (reuse_swap_page(page, NULL)) {
+		pmd_t entry;
+
+		entry = pmd_mkyoung(orig_pmd);
+		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
+			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+		ret |= VM_FAULT_WRITE;
+		unlock_page(page);
+		goto out_unlock;
+	}
+	unlock_page(page);
+	get_page(page);
+	spin_unlock(vmf->ptl);
+alloc:
+	if (__transparent_hugepage_enabled(vma) &&
+	    !transparent_hugepage_debug_cow()) {
+		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
+	} else
+		new_page = NULL;
+
+	if (likely(new_page)) {
+		prep_transhuge_page(new_page);
+	} else {
+		if (!page) {
+			split_huge_pmd(vma, vmf->pmd, vmf->address);
+			ret |= VM_FAULT_FALLBACK;
+		} else {
+			ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
+			if (ret & VM_FAULT_OOM) {
+				split_huge_pmd(vma, vmf->pmd, vmf->address);
+				ret |= VM_FAULT_FALLBACK;
+			}
+			put_page(page);
+		}
+		count_vm_event(THP_FAULT_FALLBACK);
+		goto out;
+	}
+
+	if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
+					huge_gfp, &memcg, true))) {
+		put_page(new_page);
+		split_huge_pmd(vma, vmf->pmd, vmf->address);
+		if (page)
+			put_page(page);
+		ret |= VM_FAULT_FALLBACK;
+		count_vm_event(THP_FAULT_FALLBACK);
+		goto out;
+	}
+
+	count_vm_event(THP_FAULT_ALLOC);
+
+	if (!page)
+		clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
+	else
+		copy_user_huge_page(new_page, page, vmf->address,
+				    vma, HPAGE_PMD_NR);
+	__SetPageUptodate(new_page);
+
+	mmun_start = haddr;
+	mmun_end = haddr + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+
+	spin_lock(vmf->ptl);
+	if (page)
+		put_page(page);
+	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+		spin_unlock(vmf->ptl);
+		mem_cgroup_cancel_charge(new_page, memcg, true);
+		put_page(new_page);
+		goto out_mn;
+	} else {
+		pmd_t entry;
+
+		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
+		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+		pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
+		page_add_new_anon_rmap(new_page, vma, haddr, true);
+		mem_cgroup_commit_charge(new_page, memcg, false, true);
+		lru_cache_add_active_or_unevictable(new_page, vma);
+		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+		if (!page) {
+			add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+		} else {
+			VM_BUG_ON_PAGE(!PageHead(page), page);
+			page_remove_rmap(page, true);
+			put_page(page);
+		}
+		ret |= VM_FAULT_WRITE;
+	}
+	spin_unlock(vmf->ptl);
+out_mn:
+	/*
+	 * No need to double call mmu_notifier->invalidate_range() callback as
+	 * the above pmdp_huge_clear_flush_notify() did already call it.
+	 */
+	mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
+					mmun_end);
+out:
+	return ret;
+out_unlock:
+	spin_unlock(vmf->ptl);
+	return ret;
+}
+
+vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
+{
+	if (transparent_hugepage_fast_cow())
+		return do_huge_pmd_wp_page_fast(vmf, orig_pmd);
+
+	return do_huge_pmd_wp_page_slow(vmf, orig_pmd);
+}
+
 /*
  * FOLL_FORCE can write to even unwritable pmd's, but only
  * after we've gone through a COW cycle and they are dirty.
-- 
GitLab
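
For reference, the fast_cow knob described in the changelog is the kind of
attribute that mm/huge_memory.c exposes under
/sys/kernel/mm/transparent_hugepage/ via its single_hugepage_flag_show()
and single_hugepage_flag_store() helpers, as it already does for
use_zero_page.  The following is only a minimal sketch of that wiring,
assuming the same pattern; the 0644 mode and the exact hookup into
hugepage_attr[] are illustrative assumptions, not taken verbatim from this
patch:

	/* Illustrative sketch, not the patch's actual sysfs hunk. */
	static ssize_t fast_cow_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
	{
		return single_hugepage_flag_show(kobj, attr, buf,
					TRANSPARENT_HUGEPAGE_FAST_COW_FLAG);
	}

	static ssize_t fast_cow_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
	{
		return single_hugepage_flag_store(kobj, attr, buf, count,
					TRANSPARENT_HUGEPAGE_FAST_COW_FLAG);
	}

	/* Mode is an assumption; the attribute would be added to hugepage_attr[]. */
	static struct kobj_attribute fast_cow_attr =
		__ATTR(fast_cow, 0644, fast_cow_show, fast_cow_store);

With TRANSPARENT_HUGEPAGE_FAST_COW_FLAG set by default, writing 0 to the
knob clears the flag, so transparent_hugepage_fast_cow() returns false and
do_huge_pmd_wp_page() dispatches to do_huge_pmd_wp_page_slow(), i.e. the
pre-rewrite CoW behavior.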