diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9539d6654bb90e8ab7f8078dd96cdbabd21a6903..d89220cb1d9fc69942362ff3871042e469409879 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -12,12 +12,14 @@ #include #include #include +#include #include #include #include #include #include #include + #include #include #include "internal.h" @@ -47,7 +49,6 @@ static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; /* during fragmentation poll the hugepage allocator once every minute */ static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; static struct task_struct *khugepaged_thread __read_mostly; -static unsigned long huge_zero_pfn __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); static DEFINE_SPINLOCK(khugepaged_mm_lock); static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); @@ -160,31 +161,74 @@ static int start_khugepaged(void) return err; } -static int init_huge_zero_pfn(void) +static atomic_t huge_zero_refcount; +static unsigned long huge_zero_pfn __read_mostly; + +static inline bool is_huge_zero_pfn(unsigned long pfn) { - struct page *hpage; - unsigned long pfn; + unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); + return zero_pfn && pfn == zero_pfn; +} - hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, +static inline bool is_huge_zero_pmd(pmd_t pmd) +{ + return is_huge_zero_pfn(pmd_pfn(pmd)); +} + +static unsigned long get_huge_zero_page(void) +{ + struct page *zero_page; +retry: + if (likely(atomic_inc_not_zero(&huge_zero_refcount))) + return ACCESS_ONCE(huge_zero_pfn); + + zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); - if (!hpage) - return -ENOMEM; - pfn = page_to_pfn(hpage); - if (cmpxchg(&huge_zero_pfn, 0, pfn)) - __free_page(hpage); - return 0; + if (!zero_page) + return 0; + preempt_disable(); + if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { + preempt_enable(); + __free_page(zero_page); + goto retry; + } + + /* We take additional reference here. It will be put back by shrinker */ + atomic_set(&huge_zero_refcount, 2); + preempt_enable(); + return ACCESS_ONCE(huge_zero_pfn); } -static inline bool is_huge_zero_pfn(unsigned long pfn) +static void put_huge_zero_page(void) { - return huge_zero_pfn && pfn == huge_zero_pfn; + /* + * Counter should never go to zero here. Only shrinker can put + * last reference. + */ + BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); } -static inline bool is_huge_zero_pmd(pmd_t pmd) +static int shrink_huge_zero_page(struct shrinker *shrink, + struct shrink_control *sc) { - return is_huge_zero_pfn(pmd_pfn(pmd)); + if (!sc->nr_to_scan) + /* we can free zero page only if last reference remains */ + return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; + + if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { + unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); + BUG_ON(zero_pfn == 0); + __free_page(__pfn_to_page(zero_pfn)); + } + + return 0; } +static struct shrinker huge_zero_page_shrinker = { + .shrink = shrink_huge_zero_page, + .seeks = DEFAULT_SEEKS, +}; + #ifdef CONFIG_SYSFS static ssize_t double_flag_show(struct kobject *kobj, @@ -576,6 +620,8 @@ static int __init hugepage_init(void) goto out; } + register_shrinker(&huge_zero_page_shrinker); + /* * By default disable transparent hugepages on smaller systems, * where the extra memory used could hurt more than TLB overhead @@ -705,10 +751,11 @@ static inline struct page *alloc_hugepage(int defrag) #endif static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd) + struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, + unsigned long zero_pfn) { pmd_t entry; - entry = pfn_pmd(huge_zero_pfn, vma->vm_page_prot); + entry = pfn_pmd(zero_pfn, vma->vm_page_prot); entry = pmd_wrprotect(entry); entry = pmd_mkhuge(entry); set_pmd_at(mm, haddr, pmd, entry); @@ -731,15 +778,19 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (!(flags & FAULT_FLAG_WRITE)) { pgtable_t pgtable; - if (unlikely(!huge_zero_pfn && init_huge_zero_pfn())) { - count_vm_event(THP_FAULT_FALLBACK); - goto out; - } + unsigned long zero_pfn; pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; + zero_pfn = get_huge_zero_page(); + if (unlikely(!zero_pfn)) { + pte_free(mm, pgtable); + count_vm_event(THP_FAULT_FALLBACK); + goto out; + } spin_lock(&mm->page_table_lock); - set_huge_zero_page(pgtable, mm, vma, haddr, pmd); + set_huge_zero_page(pgtable, mm, vma, haddr, pmd, + zero_pfn); spin_unlock(&mm->page_table_lock); return 0; } @@ -813,7 +864,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, * a page table. */ if (is_huge_zero_pmd(pmd)) { - set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd); + unsigned long zero_pfn; + /* + * get_huge_zero_page() will never allocate a new page here, + * since we already have a zero page to copy. It just takes a + * reference. + */ + zero_pfn = get_huge_zero_page(); + set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, + zero_pfn); ret = 0; goto out_unlock; } @@ -923,6 +982,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); spin_unlock(&mm->page_table_lock); + put_huge_zero_page(); inc_mm_counter(mm, MM_ANONPAGES); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -1123,9 +1183,10 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, address, pmd); - if (is_huge_zero_pmd(orig_pmd)) + if (is_huge_zero_pmd(orig_pmd)) { add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); - else { + put_huge_zero_page(); + } else { VM_BUG_ON(!PageHead(page)); page_remove_rmap(page); put_page(page); @@ -1202,6 +1263,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (is_huge_zero_pmd(orig_pmd)) { tlb->mm->nr_ptes--; spin_unlock(&tlb->mm->page_table_lock); + put_huge_zero_page(); } else { page = pmd_page(orig_pmd); page_remove_rmap(page); @@ -2511,6 +2573,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, } smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); + put_huge_zero_page(); } void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,