提交 70ccb92f 编写于 作者: A Andrea Arcangeli 提交者: Linus Torvalds

userfaultfd: non-cooperative: userfaultfd_remove revalidate vma in MADV_DONTNEED

userfaultfd_remove() has to be executed before zapping the pagetables or
UFFDIO_COPY could keep filling pages after zap_page_range returned,
which would result in non-zero data after a MADV_DONTNEED.

However userfaultfd_remove() may have to release the mmap_sem.  This was
handled correctly in MADV_REMOVE, but MADV_DONTNEED accessed a
potentially stale vma (the very vma passed to zap_page_range(vma, ...)).

The fix consists in revalidating the vma in case userfaultfd_remove()
had to release the mmap_sem.

This also optimizes away an unnecessary down_read/up_read in the
MADV_REMOVE case if UFFD_EVENT_FORK had to be delivered.

It all remains zero runtime cost in case CONFIG_USERFAULTFD=n as
userfaultfd_remove() will be defined as "true" at build time.

Link: http://lkml.kernel.org/r/20170302173738.18994-3-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
上级 7eb76d45
...@@ -695,8 +695,7 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, ...@@ -695,8 +695,7 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
userfaultfd_event_wait_completion(ctx, &ewq); userfaultfd_event_wait_completion(ctx, &ewq);
} }
void userfaultfd_remove(struct vm_area_struct *vma, bool userfaultfd_remove(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end) unsigned long start, unsigned long end)
{ {
struct mm_struct *mm = vma->vm_mm; struct mm_struct *mm = vma->vm_mm;
...@@ -705,13 +704,11 @@ void userfaultfd_remove(struct vm_area_struct *vma, ...@@ -705,13 +704,11 @@ void userfaultfd_remove(struct vm_area_struct *vma,
ctx = vma->vm_userfaultfd_ctx.ctx; ctx = vma->vm_userfaultfd_ctx.ctx;
if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE)) if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
return; return true;
userfaultfd_ctx_get(ctx); userfaultfd_ctx_get(ctx);
up_read(&mm->mmap_sem); up_read(&mm->mmap_sem);
*prev = NULL; /* We wait for ACK w/o the mmap semaphore */
msg_init(&ewq.msg); msg_init(&ewq.msg);
ewq.msg.event = UFFD_EVENT_REMOVE; ewq.msg.event = UFFD_EVENT_REMOVE;
...@@ -720,7 +717,7 @@ void userfaultfd_remove(struct vm_area_struct *vma, ...@@ -720,7 +717,7 @@ void userfaultfd_remove(struct vm_area_struct *vma,
userfaultfd_event_wait_completion(ctx, &ewq); userfaultfd_event_wait_completion(ctx, &ewq);
down_read(&mm->mmap_sem); return false;
} }
static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
......
...@@ -61,8 +61,7 @@ extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, ...@@ -61,8 +61,7 @@ extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *,
unsigned long from, unsigned long to, unsigned long from, unsigned long to,
unsigned long len); unsigned long len);
extern void userfaultfd_remove(struct vm_area_struct *vma, extern bool userfaultfd_remove(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long start,
unsigned long end); unsigned long end);
...@@ -118,11 +117,11 @@ static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, ...@@ -118,11 +117,11 @@ static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx,
{ {
} }
static inline void userfaultfd_remove(struct vm_area_struct *vma, static inline bool userfaultfd_remove(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long start,
unsigned long end) unsigned long end)
{ {
return true;
} }
static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
......
...@@ -513,7 +513,43 @@ static long madvise_dontneed(struct vm_area_struct *vma, ...@@ -513,7 +513,43 @@ static long madvise_dontneed(struct vm_area_struct *vma,
if (!can_madv_dontneed_vma(vma)) if (!can_madv_dontneed_vma(vma))
return -EINVAL; return -EINVAL;
userfaultfd_remove(vma, prev, start, end); if (!userfaultfd_remove(vma, start, end)) {
*prev = NULL; /* mmap_sem has been dropped, prev is stale */
down_read(&current->mm->mmap_sem);
vma = find_vma(current->mm, start);
if (!vma)
return -ENOMEM;
if (start < vma->vm_start) {
/*
* This "vma" under revalidation is the one
* with the lowest vma->vm_start where start
* is also < vma->vm_end. If start <
* vma->vm_start it means an hole materialized
* in the user address space within the
* virtual range passed to MADV_DONTNEED.
*/
return -ENOMEM;
}
if (!can_madv_dontneed_vma(vma))
return -EINVAL;
if (end > vma->vm_end) {
/*
* Don't fail if end > vma->vm_end. If the old
* vma was splitted while the mmap_sem was
* released the effect of the concurrent
* operation may not cause MADV_DONTNEED to
* have an undefined result. There may be an
* adjacent next vma that we'll walk
* next. userfaultfd_remove() will generate an
* UFFD_EVENT_REMOVE repetition on the
* end-vma->vm_end range, but the manager can
* handle a repetition fine.
*/
end = vma->vm_end;
}
VM_WARN_ON(start >= end);
}
zap_page_range(vma, start, end - start); zap_page_range(vma, start, end - start);
return 0; return 0;
} }
...@@ -554,8 +590,10 @@ static long madvise_remove(struct vm_area_struct *vma, ...@@ -554,8 +590,10 @@ static long madvise_remove(struct vm_area_struct *vma,
* mmap_sem. * mmap_sem.
*/ */
get_file(f); get_file(f);
userfaultfd_remove(vma, prev, start, end); if (userfaultfd_remove(vma, start, end)) {
up_read(&current->mm->mmap_sem); /* mmap_sem was not released by userfaultfd_remove() */
up_read(&current->mm->mmap_sem);
}
error = vfs_fallocate(f, error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start); offset, end - start);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册