提交 bae473a4 编写于 作者: K Kirill A. Shutemov 提交者: Linus Torvalds

mm: introduce fault_env

The idea borrowed from Peter's patch from patchset on speculative page
faults[1]:

Instead of passing around the endless list of function arguments,
replace the lot with a single structure so we can change context without
endless function signature changes.

The changes are mostly mechanical with exception of faultaround code:
filemap_map_pages() got reworked a bit.

This patch is preparation for the next one.

[1] http://lkml.kernel.org/r/20141020222841.302891540@infradead.org

Link: http://lkml.kernel.org/r/1466021202-61880-9-git-send-email-kirill.shutemov@linux.intel.comSigned-off-by: NKirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: NPeter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: NAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: NLinus Torvalds <torvalds@linux-foundation.org>
上级 dcddffd4
...@@ -548,13 +548,13 @@ subsequent truncate), and then return with VM_FAULT_LOCKED, and the page ...@@ -548,13 +548,13 @@ subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
locked. The VM will unlock the page. locked. The VM will unlock the page.
->map_pages() is called when VM asks to map easy accessible pages. ->map_pages() is called when VM asks to map easy accessible pages.
Filesystem should find and map pages associated with offsets from "pgoff" Filesystem should find and map pages associated with offsets from "start_pgoff"
till "max_pgoff". ->map_pages() is called with page table locked and must till "end_pgoff". ->map_pages() is called with page table locked and must
not block. If it's not possible to reach a page without blocking, not block. If it's not possible to reach a page without blocking,
filesystem should skip it. Filesystem should use do_set_pte() to setup filesystem should skip it. Filesystem should use do_set_pte() to setup
page table entry. Pointer to entry associated with offset "pgoff" is page table entry. Pointer to entry associated with the page is passed in
passed in "pte" field in vm_fault structure. Pointers to entries for other "pte" field in fault_env structure. Pointers to entries for other offsets
offsets should be calculated relative to "pte". should be calculated relative to "pte".
->page_mkwrite() is called when a previously read-only pte is ->page_mkwrite() is called when a previously read-only pte is
about to become writeable. The filesystem again must ensure that there are about to become writeable. The filesystem again must ensure that there are
......
...@@ -257,10 +257,9 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, ...@@ -257,10 +257,9 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
* fatal_signal_pending()s, and the mmap_sem must be released before * fatal_signal_pending()s, and the mmap_sem must be released before
* returning it. * returning it.
*/ */
int handle_userfault(struct vm_area_struct *vma, unsigned long address, int handle_userfault(struct fault_env *fe, unsigned long reason)
unsigned int flags, unsigned long reason)
{ {
struct mm_struct *mm = vma->vm_mm; struct mm_struct *mm = fe->vma->vm_mm;
struct userfaultfd_ctx *ctx; struct userfaultfd_ctx *ctx;
struct userfaultfd_wait_queue uwq; struct userfaultfd_wait_queue uwq;
int ret; int ret;
...@@ -269,7 +268,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, ...@@ -269,7 +268,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
ret = VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS;
ctx = vma->vm_userfaultfd_ctx.ctx; ctx = fe->vma->vm_userfaultfd_ctx.ctx;
if (!ctx) if (!ctx)
goto out; goto out;
...@@ -302,17 +301,17 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, ...@@ -302,17 +301,17 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
* without first stopping userland access to the memory. For * without first stopping userland access to the memory. For
* VM_UFFD_MISSING userfaults this is enough for now. * VM_UFFD_MISSING userfaults this is enough for now.
*/ */
if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) { if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) {
/* /*
* Validate the invariant that nowait must allow retry * Validate the invariant that nowait must allow retry
* to be sure not to return SIGBUS erroneously on * to be sure not to return SIGBUS erroneously on
* nowait invocations. * nowait invocations.
*/ */
BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT); BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM #ifdef CONFIG_DEBUG_VM
if (printk_ratelimit()) { if (printk_ratelimit()) {
printk(KERN_WARNING printk(KERN_WARNING
"FAULT_FLAG_ALLOW_RETRY missing %x\n", flags); "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags);
dump_stack(); dump_stack();
} }
#endif #endif
...@@ -324,7 +323,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, ...@@ -324,7 +323,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
* and wait. * and wait.
*/ */
ret = VM_FAULT_RETRY; ret = VM_FAULT_RETRY;
if (flags & FAULT_FLAG_RETRY_NOWAIT) if (fe->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out; goto out;
/* take the reference before dropping the mmap_sem */ /* take the reference before dropping the mmap_sem */
...@@ -332,10 +331,11 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, ...@@ -332,10 +331,11 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current; uwq.wq.private = current;
uwq.msg = userfault_msg(address, flags, reason); uwq.msg = userfault_msg(fe->address, fe->flags, reason);
uwq.ctx = ctx; uwq.ctx = ctx;
return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == return_to_userland =
(fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
(FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
spin_lock(&ctx->fault_pending_wqh.lock); spin_lock(&ctx->fault_pending_wqh.lock);
...@@ -353,7 +353,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, ...@@ -353,7 +353,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
TASK_KILLABLE); TASK_KILLABLE);
spin_unlock(&ctx->fault_pending_wqh.lock); spin_unlock(&ctx->fault_pending_wqh.lock);
must_wait = userfaultfd_must_wait(ctx, address, flags, reason); must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason);
up_read(&mm->mmap_sem); up_read(&mm->mmap_sem);
if (likely(must_wait && !ACCESS_ONCE(ctx->released) && if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
......
#ifndef _LINUX_HUGE_MM_H #ifndef _LINUX_HUGE_MM_H
#define _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H
extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, extern int do_huge_pmd_anonymous_page(struct fault_env *fe);
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
unsigned int flags);
extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma); struct vm_area_struct *vma);
extern void huge_pmd_set_accessed(struct mm_struct *mm, extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd);
struct vm_area_struct *vma, extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd);
unsigned long address, pmd_t *pmd,
pmd_t orig_pmd, int dirty);
extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pmd_t orig_pmd);
extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr, unsigned long addr,
pmd_t *pmd, pmd_t *pmd,
...@@ -134,8 +126,7 @@ static inline int hpage_nr_pages(struct page *page) ...@@ -134,8 +126,7 @@ static inline int hpage_nr_pages(struct page *page)
return 1; return 1;
} }
extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd);
unsigned long addr, pmd_t pmd, pmd_t *pmdp);
extern struct page *huge_zero_page; extern struct page *huge_zero_page;
...@@ -196,8 +187,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, ...@@ -196,8 +187,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
return NULL; return NULL;
} }
static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd)
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
{ {
return 0; return 0;
} }
......
...@@ -309,10 +309,27 @@ struct vm_fault { ...@@ -309,10 +309,27 @@ struct vm_fault {
* VM_FAULT_DAX_LOCKED and fill in * VM_FAULT_DAX_LOCKED and fill in
* entry here. * entry here.
*/ */
/* for ->map_pages() only */ };
pgoff_t max_pgoff; /* map pages for offset from pgoff till
* max_pgoff inclusive */ /*
pte_t *pte; /* pte entry associated with ->pgoff */ * Page fault context: passes though page fault handler instead of endless list
* of function arguments.
*/
struct fault_env {
struct vm_area_struct *vma; /* Target VMA */
unsigned long address; /* Faulting virtual address */
unsigned int flags; /* FAULT_FLAG_xxx flags */
pmd_t *pmd; /* Pointer to pmd entry matching
* the 'address'
*/
pte_t *pte; /* Pointer to pte entry matching
* the 'address'. NULL if the page
* table hasn't been allocated.
*/
spinlock_t *ptl; /* Page table lock.
* Protects pte page table if 'pte'
* is not NULL, otherwise pmd.
*/
}; };
/* /*
...@@ -327,7 +344,8 @@ struct vm_operations_struct { ...@@ -327,7 +344,8 @@ struct vm_operations_struct {
int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
int (*pmd_fault)(struct vm_area_struct *, unsigned long address, int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
pmd_t *, unsigned int flags); pmd_t *, unsigned int flags);
void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); void (*map_pages)(struct fault_env *fe,
pgoff_t start_pgoff, pgoff_t end_pgoff);
/* notification that a previously read-only page is about to become /* notification that a previously read-only page is about to become
* writable, if an error is returned it will cause a SIGBUS */ * writable, if an error is returned it will cause a SIGBUS */
...@@ -600,8 +618,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) ...@@ -600,8 +618,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
return pte; return pte;
} }
void do_set_pte(struct vm_area_struct *vma, unsigned long address, void do_set_pte(struct fault_env *fe, struct page *page);
struct page *page, pte_t *pte, bool write, bool anon);
#endif #endif
/* /*
...@@ -2062,7 +2079,8 @@ extern void truncate_inode_pages_final(struct address_space *); ...@@ -2062,7 +2079,8 @@ extern void truncate_inode_pages_final(struct address_space *);
/* generic vm_area_ops exported for stackable file systems */ /* generic vm_area_ops exported for stackable file systems */
extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
extern void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf); extern void filemap_map_pages(struct fault_env *fe,
pgoff_t start_pgoff, pgoff_t end_pgoff);
extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
/* mm/page-writeback.c */ /* mm/page-writeback.c */
......
...@@ -27,8 +27,7 @@ ...@@ -27,8 +27,7 @@
#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
extern int handle_userfault(struct vm_area_struct *vma, unsigned long address, extern int handle_userfault(struct fault_env *fe, unsigned long reason);
unsigned int flags, unsigned long reason);
extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len); unsigned long src_start, unsigned long len);
...@@ -56,10 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) ...@@ -56,10 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
#else /* CONFIG_USERFAULTFD */ #else /* CONFIG_USERFAULTFD */
/* mm helpers */ /* mm helpers */
static inline int handle_userfault(struct vm_area_struct *vma, static inline int handle_userfault(struct fault_env *fe, unsigned long reason)
unsigned long address,
unsigned int flags,
unsigned long reason)
{ {
return VM_FAULT_SIGBUS; return VM_FAULT_SIGBUS;
} }
......
...@@ -2128,22 +2128,27 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ...@@ -2128,22 +2128,27 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
} }
EXPORT_SYMBOL(filemap_fault); EXPORT_SYMBOL(filemap_fault);
void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) void filemap_map_pages(struct fault_env *fe,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{ {
struct radix_tree_iter iter; struct radix_tree_iter iter;
void **slot; void **slot;
struct file *file = vma->vm_file; struct file *file = fe->vma->vm_file;
struct address_space *mapping = file->f_mapping; struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
loff_t size; loff_t size;
struct page *page; struct page *page;
unsigned long address = (unsigned long) vmf->virtual_address;
unsigned long addr;
pte_t *pte;
rcu_read_lock(); rcu_read_lock();
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
if (iter.index > vmf->max_pgoff) start_pgoff) {
if (iter.index > end_pgoff)
break; break;
fe->pte += iter.index - last_pgoff;
fe->address += (iter.index - last_pgoff) << PAGE_SHIFT;
last_pgoff = iter.index;
if (!pte_none(*fe->pte))
goto next;
repeat: repeat:
page = radix_tree_deref_slot(slot); page = radix_tree_deref_slot(slot);
if (unlikely(!page)) if (unlikely(!page))
...@@ -2179,14 +2184,9 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) ...@@ -2179,14 +2184,9 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
if (page->index >= size >> PAGE_SHIFT) if (page->index >= size >> PAGE_SHIFT)
goto unlock; goto unlock;
pte = vmf->pte + page->index - vmf->pgoff;
if (!pte_none(*pte))
goto unlock;
if (file->f_ra.mmap_miss > 0) if (file->f_ra.mmap_miss > 0)
file->f_ra.mmap_miss--; file->f_ra.mmap_miss--;
addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; do_set_pte(fe, page);
do_set_pte(vma, addr, page, pte, false, false);
unlock_page(page); unlock_page(page);
goto next; goto next;
unlock: unlock:
...@@ -2194,7 +2194,7 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) ...@@ -2194,7 +2194,7 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
skip: skip:
put_page(page); put_page(page);
next: next:
if (iter.index == vmf->max_pgoff) if (iter.index == end_pgoff)
break; break;
} }
rcu_read_unlock(); rcu_read_unlock();
......
...@@ -821,26 +821,23 @@ void prep_transhuge_page(struct page *page) ...@@ -821,26 +821,23 @@ void prep_transhuge_page(struct page *page)
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
} }
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
struct vm_area_struct *vma, gfp_t gfp)
unsigned long address, pmd_t *pmd,
struct page *page, gfp_t gfp,
unsigned int flags)
{ {
struct vm_area_struct *vma = fe->vma;
struct mem_cgroup *memcg; struct mem_cgroup *memcg;
pgtable_t pgtable; pgtable_t pgtable;
spinlock_t *ptl; unsigned long haddr = fe->address & HPAGE_PMD_MASK;
unsigned long haddr = address & HPAGE_PMD_MASK;
VM_BUG_ON_PAGE(!PageCompound(page), page); VM_BUG_ON_PAGE(!PageCompound(page), page);
if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) { if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
put_page(page); put_page(page);
count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK; return VM_FAULT_FALLBACK;
} }
pgtable = pte_alloc_one(mm, haddr); pgtable = pte_alloc_one(vma->vm_mm, haddr);
if (unlikely(!pgtable)) { if (unlikely(!pgtable)) {
mem_cgroup_cancel_charge(page, memcg, true); mem_cgroup_cancel_charge(page, memcg, true);
put_page(page); put_page(page);
...@@ -855,12 +852,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, ...@@ -855,12 +852,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
*/ */
__SetPageUptodate(page); __SetPageUptodate(page);
ptl = pmd_lock(mm, pmd); fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
if (unlikely(!pmd_none(*pmd))) { if (unlikely(!pmd_none(*fe->pmd))) {
spin_unlock(ptl); spin_unlock(fe->ptl);
mem_cgroup_cancel_charge(page, memcg, true); mem_cgroup_cancel_charge(page, memcg, true);
put_page(page); put_page(page);
pte_free(mm, pgtable); pte_free(vma->vm_mm, pgtable);
} else { } else {
pmd_t entry; pmd_t entry;
...@@ -868,12 +865,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, ...@@ -868,12 +865,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
if (userfaultfd_missing(vma)) { if (userfaultfd_missing(vma)) {
int ret; int ret;
spin_unlock(ptl); spin_unlock(fe->ptl);
mem_cgroup_cancel_charge(page, memcg, true); mem_cgroup_cancel_charge(page, memcg, true);
put_page(page); put_page(page);
pte_free(mm, pgtable); pte_free(vma->vm_mm, pgtable);
ret = handle_userfault(vma, address, flags, ret = handle_userfault(fe, VM_UFFD_MISSING);
VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK); VM_BUG_ON(ret & VM_FAULT_FALLBACK);
return ret; return ret;
} }
...@@ -883,11 +879,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, ...@@ -883,11 +879,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
page_add_new_anon_rmap(page, vma, haddr, true); page_add_new_anon_rmap(page, vma, haddr, true);
mem_cgroup_commit_charge(page, memcg, false, true); mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma); lru_cache_add_active_or_unevictable(page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable); pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry); set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
atomic_long_inc(&mm->nr_ptes); atomic_long_inc(&vma->vm_mm->nr_ptes);
spin_unlock(ptl); spin_unlock(fe->ptl);
count_vm_event(THP_FAULT_ALLOC); count_vm_event(THP_FAULT_ALLOC);
} }
...@@ -937,13 +933,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, ...@@ -937,13 +933,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return true; return true;
} }
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, int do_huge_pmd_anonymous_page(struct fault_env *fe)
unsigned long address, pmd_t *pmd,
unsigned int flags)
{ {
struct vm_area_struct *vma = fe->vma;
gfp_t gfp; gfp_t gfp;
struct page *page; struct page *page;
unsigned long haddr = address & HPAGE_PMD_MASK; unsigned long haddr = fe->address & HPAGE_PMD_MASK;
if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
return VM_FAULT_FALLBACK; return VM_FAULT_FALLBACK;
...@@ -951,42 +946,40 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -951,42 +946,40 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM; return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma, vma->vm_flags))) if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
return VM_FAULT_OOM; return VM_FAULT_OOM;
if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) && if (!(fe->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
transparent_hugepage_use_zero_page()) { transparent_hugepage_use_zero_page()) {
spinlock_t *ptl;
pgtable_t pgtable; pgtable_t pgtable;
struct page *zero_page; struct page *zero_page;
bool set; bool set;
int ret; int ret;
pgtable = pte_alloc_one(mm, haddr); pgtable = pte_alloc_one(vma->vm_mm, haddr);
if (unlikely(!pgtable)) if (unlikely(!pgtable))
return VM_FAULT_OOM; return VM_FAULT_OOM;
zero_page = get_huge_zero_page(); zero_page = get_huge_zero_page();
if (unlikely(!zero_page)) { if (unlikely(!zero_page)) {
pte_free(mm, pgtable); pte_free(vma->vm_mm, pgtable);
count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK; return VM_FAULT_FALLBACK;
} }
ptl = pmd_lock(mm, pmd); fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
ret = 0; ret = 0;
set = false; set = false;
if (pmd_none(*pmd)) { if (pmd_none(*fe->pmd)) {
if (userfaultfd_missing(vma)) { if (userfaultfd_missing(vma)) {
spin_unlock(ptl); spin_unlock(fe->ptl);
ret = handle_userfault(vma, address, flags, ret = handle_userfault(fe, VM_UFFD_MISSING);
VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK); VM_BUG_ON(ret & VM_FAULT_FALLBACK);
} else { } else {
set_huge_zero_page(pgtable, mm, vma, set_huge_zero_page(pgtable, vma->vm_mm, vma,
haddr, pmd, haddr, fe->pmd, zero_page);
zero_page); spin_unlock(fe->ptl);
spin_unlock(ptl);
set = true; set = true;
} }
} else } else
spin_unlock(ptl); spin_unlock(fe->ptl);
if (!set) { if (!set) {
pte_free(mm, pgtable); pte_free(vma->vm_mm, pgtable);
put_huge_zero_page(); put_huge_zero_page();
} }
return ret; return ret;
...@@ -998,8 +991,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -998,8 +991,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_FALLBACK; return VM_FAULT_FALLBACK;
} }
prep_transhuge_page(page); prep_transhuge_page(page);
return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, return __do_huge_pmd_anonymous_page(fe, page, gfp);
flags);
} }
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
...@@ -1172,38 +1164,31 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ...@@ -1172,38 +1164,31 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
return ret; return ret;
} }
void huge_pmd_set_accessed(struct mm_struct *mm, void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd)
struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmd, pmd_t orig_pmd,
int dirty)
{ {
spinlock_t *ptl;
pmd_t entry; pmd_t entry;
unsigned long haddr; unsigned long haddr;
ptl = pmd_lock(mm, pmd); fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd);
if (unlikely(!pmd_same(*pmd, orig_pmd))) if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
goto unlock; goto unlock;
entry = pmd_mkyoung(orig_pmd); entry = pmd_mkyoung(orig_pmd);
haddr = address & HPAGE_PMD_MASK; haddr = fe->address & HPAGE_PMD_MASK;
if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry,
update_mmu_cache_pmd(vma, address, pmd); fe->flags & FAULT_FLAG_WRITE))
update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd);
unlock: unlock:
spin_unlock(ptl); spin_unlock(fe->ptl);
} }
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
struct vm_area_struct *vma, struct page *page)
unsigned long address,
pmd_t *pmd, pmd_t orig_pmd,
struct page *page,
unsigned long haddr)
{ {
struct vm_area_struct *vma = fe->vma;
unsigned long haddr = fe->address & HPAGE_PMD_MASK;
struct mem_cgroup *memcg; struct mem_cgroup *memcg;
spinlock_t *ptl;
pgtable_t pgtable; pgtable_t pgtable;
pmd_t _pmd; pmd_t _pmd;
int ret = 0, i; int ret = 0, i;
...@@ -1220,11 +1205,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, ...@@ -1220,11 +1205,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
for (i = 0; i < HPAGE_PMD_NR; i++) { for (i = 0; i < HPAGE_PMD_NR; i++) {
pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
__GFP_OTHER_NODE, __GFP_OTHER_NODE, vma,
vma, address, page_to_nid(page)); fe->address, page_to_nid(page));
if (unlikely(!pages[i] || if (unlikely(!pages[i] ||
mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, mem_cgroup_try_charge(pages[i], vma->vm_mm,
&memcg, false))) { GFP_KERNEL, &memcg, false))) {
if (pages[i]) if (pages[i])
put_page(pages[i]); put_page(pages[i]);
while (--i >= 0) { while (--i >= 0) {
...@@ -1250,41 +1235,41 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, ...@@ -1250,41 +1235,41 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
mmun_start = haddr; mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE; mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd); fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
if (unlikely(!pmd_same(*pmd, orig_pmd))) if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
goto out_free_pages; goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page); VM_BUG_ON_PAGE(!PageHead(page), page);
pmdp_huge_clear_flush_notify(vma, haddr, pmd); pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
/* leave pmd empty until pte is filled */ /* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(mm, pmd); pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd);
pmd_populate(mm, &_pmd, pgtable); pmd_populate(vma->vm_mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
pte_t *pte, entry; pte_t entry;
entry = mk_pte(pages[i], vma->vm_page_prot); entry = mk_pte(pages[i], vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma); entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]); memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0); set_page_private(pages[i], 0);
page_add_new_anon_rmap(pages[i], vma, haddr, false); page_add_new_anon_rmap(pages[i], fe->vma, haddr, false);
mem_cgroup_commit_charge(pages[i], memcg, false, false); mem_cgroup_commit_charge(pages[i], memcg, false, false);
lru_cache_add_active_or_unevictable(pages[i], vma); lru_cache_add_active_or_unevictable(pages[i], vma);
pte = pte_offset_map(&_pmd, haddr); fe->pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte_none(*pte)); VM_BUG_ON(!pte_none(*fe->pte));
set_pte_at(mm, haddr, pte, entry); set_pte_at(vma->vm_mm, haddr, fe->pte, entry);
pte_unmap(pte); pte_unmap(fe->pte);
} }
kfree(pages); kfree(pages);
smp_wmb(); /* make pte visible before pmd */ smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable); pmd_populate(vma->vm_mm, fe->pmd, pgtable);
page_remove_rmap(page, true); page_remove_rmap(page, true);
spin_unlock(ptl); spin_unlock(fe->ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
ret |= VM_FAULT_WRITE; ret |= VM_FAULT_WRITE;
put_page(page); put_page(page);
...@@ -1293,8 +1278,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, ...@@ -1293,8 +1278,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
return ret; return ret;
out_free_pages: out_free_pages:
spin_unlock(ptl); spin_unlock(fe->ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
for (i = 0; i < HPAGE_PMD_NR; i++) { for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]); memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0); set_page_private(pages[i], 0);
...@@ -1305,25 +1290,23 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, ...@@ -1305,25 +1290,23 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
goto out; goto out;
} }
int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{ {
spinlock_t *ptl; struct vm_area_struct *vma = fe->vma;
int ret = 0;
struct page *page = NULL, *new_page; struct page *page = NULL, *new_page;
struct mem_cgroup *memcg; struct mem_cgroup *memcg;
unsigned long haddr; unsigned long haddr = fe->address & HPAGE_PMD_MASK;
unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */
gfp_t huge_gfp; /* for allocation and charge */ gfp_t huge_gfp; /* for allocation and charge */
int ret = 0;
ptl = pmd_lockptr(mm, pmd); fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma); VM_BUG_ON_VMA(!vma->anon_vma, vma);
haddr = address & HPAGE_PMD_MASK;
if (is_huge_zero_pmd(orig_pmd)) if (is_huge_zero_pmd(orig_pmd))
goto alloc; goto alloc;
spin_lock(ptl); spin_lock(fe->ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd))) if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
goto out_unlock; goto out_unlock;
page = pmd_page(orig_pmd); page = pmd_page(orig_pmd);
...@@ -1336,13 +1319,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1336,13 +1319,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t entry; pmd_t entry;
entry = pmd_mkyoung(orig_pmd); entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1))
update_mmu_cache_pmd(vma, address, pmd); update_mmu_cache_pmd(vma, fe->address, fe->pmd);
ret |= VM_FAULT_WRITE; ret |= VM_FAULT_WRITE;
goto out_unlock; goto out_unlock;
} }
get_page(page); get_page(page);
spin_unlock(ptl); spin_unlock(fe->ptl);
alloc: alloc:
if (transparent_hugepage_enabled(vma) && if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) { !transparent_hugepage_debug_cow()) {
...@@ -1355,13 +1338,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1355,13 +1338,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
prep_transhuge_page(new_page); prep_transhuge_page(new_page);
} else { } else {
if (!page) { if (!page) {
split_huge_pmd(vma, pmd, address); split_huge_pmd(vma, fe->pmd, fe->address);
ret |= VM_FAULT_FALLBACK; ret |= VM_FAULT_FALLBACK;
} else { } else {
ret = do_huge_pmd_wp_page_fallback(mm, vma, address, ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page);
pmd, orig_pmd, page, haddr);
if (ret & VM_FAULT_OOM) { if (ret & VM_FAULT_OOM) {
split_huge_pmd(vma, pmd, address); split_huge_pmd(vma, fe->pmd, fe->address);
ret |= VM_FAULT_FALLBACK; ret |= VM_FAULT_FALLBACK;
} }
put_page(page); put_page(page);
...@@ -1370,14 +1352,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1370,14 +1352,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out; goto out;
} }
if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg, if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
true))) { huge_gfp, &memcg, true))) {
put_page(new_page); put_page(new_page);
if (page) { split_huge_pmd(vma, fe->pmd, fe->address);
split_huge_pmd(vma, pmd, address); if (page)
put_page(page); put_page(page);
} else
split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK; ret |= VM_FAULT_FALLBACK;
count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK);
goto out; goto out;
...@@ -1393,13 +1373,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1393,13 +1373,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
mmun_start = haddr; mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE; mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
spin_lock(ptl); spin_lock(fe->ptl);
if (page) if (page)
put_page(page); put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) { if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) {
spin_unlock(ptl); spin_unlock(fe->ptl);
mem_cgroup_cancel_charge(new_page, memcg, true); mem_cgroup_cancel_charge(new_page, memcg, true);
put_page(new_page); put_page(new_page);
goto out_mn; goto out_mn;
...@@ -1407,14 +1387,14 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1407,14 +1387,14 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t entry; pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
pmdp_huge_clear_flush_notify(vma, haddr, pmd); pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
page_add_new_anon_rmap(new_page, vma, haddr, true); page_add_new_anon_rmap(new_page, vma, haddr, true);
mem_cgroup_commit_charge(new_page, memcg, false, true); mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma); lru_cache_add_active_or_unevictable(new_page, vma);
set_pmd_at(mm, haddr, pmd, entry); set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
update_mmu_cache_pmd(vma, address, pmd); update_mmu_cache_pmd(vma, fe->address, fe->pmd);
if (!page) { if (!page) {
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
put_huge_zero_page(); put_huge_zero_page();
} else { } else {
VM_BUG_ON_PAGE(!PageHead(page), page); VM_BUG_ON_PAGE(!PageHead(page), page);
...@@ -1423,13 +1403,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1423,13 +1403,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
} }
ret |= VM_FAULT_WRITE; ret |= VM_FAULT_WRITE;
} }
spin_unlock(ptl); spin_unlock(fe->ptl);
out_mn: out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
out: out:
return ret; return ret;
out_unlock: out_unlock:
spin_unlock(ptl); spin_unlock(fe->ptl);
return ret; return ret;
} }
...@@ -1489,13 +1469,12 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, ...@@ -1489,13 +1469,12 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
} }
/* NUMA hinting page fault entry point for trans huge pmds */ /* NUMA hinting page fault entry point for trans huge pmds */
int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
{ {
spinlock_t *ptl; struct vm_area_struct *vma = fe->vma;
struct anon_vma *anon_vma = NULL; struct anon_vma *anon_vma = NULL;
struct page *page; struct page *page;
unsigned long haddr = addr & HPAGE_PMD_MASK; unsigned long haddr = fe->address & HPAGE_PMD_MASK;
int page_nid = -1, this_nid = numa_node_id(); int page_nid = -1, this_nid = numa_node_id();
int target_nid, last_cpupid = -1; int target_nid, last_cpupid = -1;
bool page_locked; bool page_locked;
...@@ -1506,8 +1485,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1506,8 +1485,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* A PROT_NONE fault should not end up here */ /* A PROT_NONE fault should not end up here */
BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
ptl = pmd_lock(mm, pmdp); fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
if (unlikely(!pmd_same(pmd, *pmdp))) if (unlikely(!pmd_same(pmd, *fe->pmd)))
goto out_unlock; goto out_unlock;
/* /*
...@@ -1515,9 +1494,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1515,9 +1494,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* without disrupting NUMA hinting information. Do not relock and * without disrupting NUMA hinting information. Do not relock and
* check_same as the page may no longer be mapped. * check_same as the page may no longer be mapped.
*/ */
if (unlikely(pmd_trans_migrating(*pmdp))) { if (unlikely(pmd_trans_migrating(*fe->pmd))) {
page = pmd_page(*pmdp); page = pmd_page(*fe->pmd);
spin_unlock(ptl); spin_unlock(fe->ptl);
wait_on_page_locked(page); wait_on_page_locked(page);
goto out; goto out;
} }
...@@ -1550,7 +1529,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1550,7 +1529,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Migration could have started since the pmd_trans_migrating check */ /* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) { if (!page_locked) {
spin_unlock(ptl); spin_unlock(fe->ptl);
wait_on_page_locked(page); wait_on_page_locked(page);
page_nid = -1; page_nid = -1;
goto out; goto out;
...@@ -1561,12 +1540,12 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1561,12 +1540,12 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* to serialises splits * to serialises splits
*/ */
get_page(page); get_page(page);
spin_unlock(ptl); spin_unlock(fe->ptl);
anon_vma = page_lock_anon_vma_read(page); anon_vma = page_lock_anon_vma_read(page);
/* Confirm the PMD did not change while page_table_lock was released */ /* Confirm the PMD did not change while page_table_lock was released */
spin_lock(ptl); spin_lock(fe->ptl);
if (unlikely(!pmd_same(pmd, *pmdp))) { if (unlikely(!pmd_same(pmd, *fe->pmd))) {
unlock_page(page); unlock_page(page);
put_page(page); put_page(page);
page_nid = -1; page_nid = -1;
...@@ -1584,9 +1563,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1584,9 +1563,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* Migrate the THP to the requested node, returns with page unlocked * Migrate the THP to the requested node, returns with page unlocked
* and access rights restored. * and access rights restored.
*/ */
spin_unlock(ptl); spin_unlock(fe->ptl);
migrated = migrate_misplaced_transhuge_page(mm, vma, migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
pmdp, pmd, addr, page, target_nid); fe->pmd, pmd, fe->address, page, target_nid);
if (migrated) { if (migrated) {
flags |= TNF_MIGRATED; flags |= TNF_MIGRATED;
page_nid = target_nid; page_nid = target_nid;
...@@ -1601,18 +1580,18 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1601,18 +1580,18 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
pmd = pmd_mkyoung(pmd); pmd = pmd_mkyoung(pmd);
if (was_writable) if (was_writable)
pmd = pmd_mkwrite(pmd); pmd = pmd_mkwrite(pmd);
set_pmd_at(mm, haddr, pmdp, pmd); set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd);
update_mmu_cache_pmd(vma, addr, pmdp); update_mmu_cache_pmd(vma, fe->address, fe->pmd);
unlock_page(page); unlock_page(page);
out_unlock: out_unlock:
spin_unlock(ptl); spin_unlock(fe->ptl);
out: out:
if (anon_vma) if (anon_vma)
page_unlock_anon_vma_read(anon_vma); page_unlock_anon_vma_read(anon_vma);
if (page_nid != -1) if (page_nid != -1)
task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags);
return 0; return 0;
} }
...@@ -2413,20 +2392,23 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, ...@@ -2413,20 +2392,23 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd) unsigned long address, pmd_t *pmd)
{ {
unsigned long _address; pte_t pteval;
pte_t *pte, pteval;
int swapped_in = 0, ret = 0; int swapped_in = 0, ret = 0;
struct fault_env fe = {
pte = pte_offset_map(pmd, address); .vma = vma,
for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE; .address = address,
pte++, _address += PAGE_SIZE) { .flags = FAULT_FLAG_ALLOW_RETRY,
pteval = *pte; .pmd = pmd,
};
fe.pte = pte_offset_map(pmd, address);
for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE;
fe.pte++, fe.address += PAGE_SIZE) {
pteval = *fe.pte;
if (!is_swap_pte(pteval)) if (!is_swap_pte(pteval))
continue; continue;
swapped_in++; swapped_in++;
ret = do_swap_page(mm, vma, _address, pte, pmd, ret = do_swap_page(&fe, pteval);
FAULT_FLAG_ALLOW_RETRY,
pteval);
/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
if (ret & VM_FAULT_RETRY) { if (ret & VM_FAULT_RETRY) {
down_read(&mm->mmap_sem); down_read(&mm->mmap_sem);
...@@ -2442,10 +2424,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, ...@@ -2442,10 +2424,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
return false; return false;
} }
/* pte is unmapped now, we need to map it */ /* pte is unmapped now, we need to map it */
pte = pte_offset_map(pmd, _address); fe.pte = pte_offset_map(pmd, fe.address);
} }
pte--; fe.pte--;
pte_unmap(pte); pte_unmap(fe.pte);
trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1); trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
return true; return true;
} }
......
...@@ -36,9 +36,7 @@ ...@@ -36,9 +36,7 @@
/* Do not use these with a slab allocator */ /* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, int do_swap_page(struct fault_env *fe, pte_t orig_pte);
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling); unsigned long floor, unsigned long ceiling);
......
此差异已折叠。
...@@ -1809,7 +1809,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ...@@ -1809,7 +1809,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
} }
EXPORT_SYMBOL(filemap_fault); EXPORT_SYMBOL(filemap_fault);
void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) void filemap_map_pages(struct fault_env *fe,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{ {
BUG(); BUG();
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册