Commit a8bef8ff, authored by Mel Gorman, committed by Linus Torvalds

mm: migration: avoid race between shift_arg_pages() and rmap_walk() during migration by not migrating temporary stacks

Page migration requires rmap to be able to find all ptes mapping a page
at all times, otherwise a migration entry can be instantiated by the
first rmap_walk but left behind if the second rmap_walk fails to find
the page.  If this page is later faulted, migration_entry_to_page() will
call BUG because the page is no longer locked, indicating that the page
was already migrated but the stale migration PTE was never cleaned up.
For example:

  kernel BUG at include/linux/swapops.h:105!
  invalid opcode: 0000 [#1] PREEMPT SMP
  ...
  Call Trace:
   [<ffffffff810e951a>] handle_mm_fault+0x3f8/0x76a
   [<ffffffff8130c7a2>] do_page_fault+0x44a/0x46e
   [<ffffffff813099b5>] page_fault+0x25/0x30
   [<ffffffff8114de33>] load_elf_binary+0x152a/0x192b
   [<ffffffff8111329b>] search_binary_handler+0x173/0x313
   [<ffffffff81114896>] do_execve+0x219/0x30a
   [<ffffffff8100a5c6>] sys_execve+0x43/0x5e
   [<ffffffff8100320a>] stub_execve+0x6a/0xc0
  RIP  [<ffffffff811094ff>] migration_entry_wait+0xc1/0x129
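
For reference, the assertion that fires is the BUG_ON() in
migration_entry_to_page() in include/linux/swapops.h. As a sketch from
kernels of this era (quoted for context, not part of this patch):

  static inline struct page *migration_entry_to_page(swp_entry_t entry)
  {
          struct page *p = pfn_to_page(swp_offset(entry));
          /*
           * Any use of migration entries may only occur while the
           * corresponding page is locked
           */
          BUG_ON(!PageLocked(p));
          return p;
  }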

There is a race between shift_arg_pages and migration that triggers this
bug.  A temporary stack is set up during exec and later moved.  If
migration moves a page in the temporary stack and the VMA is then removed
before migration completes, the migration PTE may not be found leading to
a BUG when the stack is faulted.
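
Schematically, one interleaving that can leave a stale migration PTE
behind looks like this (a simplified sketch; exact call paths omitted):

  exec (CPU 0)                          migration (CPU 1)
  ------------------------------------  ------------------------------------
                                        first rmap_walk: try_to_unmap()
                                        replaces the stack page's PTE with
                                        a migration entry
  shift_arg_pages() moves the stack
  VMA and its page tables to their
  final location
                                        second rmap_walk:
                                        remove_migration_ptes() computes
                                        the PTE address from the moved VMA
                                        and misses the migration entry
  a later fault on the stale migration
  PTE reaches migration_entry_wait()
  and hits the BUG above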

This patch causes migration to skip pages within the temporary stack
while exec is in progress.  It does this by marking the VMA covering the
temporary stack with an otherwise impossible combination of VMA flags.
These flags are cleared when the temporary stack is moved to its final
location.
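
The flag combination used is VM_RAND_READ | VM_SEQ_READ (see the mm.h
hunk below). Setting both is otherwise impossible because madvise()
treats the two read hints as mutually exclusive; as a rough sketch of
mm/madvise.c from this era, each hint clears the other bit:

  switch (behavior) {
  case MADV_SEQUENTIAL:
          new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
          break;
  case MADV_RANDOM:
          new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
          break;
  ...
  }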

[kamezawa.hiroyu@jp.fujitsu.com: idea for having migration skip temporary stacks]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Parent: e9e96b39

diff --git a/fs/exec.c b/fs/exec.c
@@ -242,9 +242,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	 * use STACK_TOP because that can depend on attributes which aren't
 	 * configured yet.
 	 */
+	BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
 	vma->vm_end = STACK_TOP_MAX;
 	vma->vm_start = vma->vm_end - PAGE_SIZE;
-	vma->vm_flags = VM_STACK_FLAGS;
+	vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	err = insert_vm_struct(mm, vma);
@@ -616,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	else if (executable_stack == EXSTACK_DISABLE_X)
 		vm_flags &= ~VM_EXEC;
 	vm_flags |= mm->def_flags;
+	vm_flags |= VM_STACK_INCOMPLETE_SETUP;

 	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
 			vm_flags);
@@ -630,6 +632,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
 		goto out_unlock;
 	}

+	/* mprotect_fixup is overkill to remove the temporary stack flags */
+	vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
+
 	stack_expand = 131072UL;	/* randomly 32*4k (or 2*64k) pages */
 	stack_size = vma->vm_end - vma->vm_start;
 	/*

diff --git a/include/linux/mm.h b/include/linux/mm.h
@@ -106,6 +106,9 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_PFN_AT_MMAP	0x40000000	/* PFNMAP vma that is fully mapped at mmap time */
 #define VM_MERGEABLE	0x80000000	/* KSM may merge identical pages */

+/* Bits set in the VMA until the stack is in its final location */
+#define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)
+
 #ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
 #endif

diff --git a/mm/rmap.c b/mm/rmap.c
@@ -1131,6 +1131,20 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
 	return ret;
 }

+static bool is_vma_temporary_stack(struct vm_area_struct *vma)
+{
+	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
+
+	if (!maybe_stack)
+		return false;
+
+	if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
+						VM_STACK_INCOMPLETE_SETUP)
+		return true;
+
+	return false;
+}
+
 /**
  * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
  * rmap method
@@ -1159,7 +1173,21 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
 		struct vm_area_struct *vma = avc->vma;
-		unsigned long address = vma_address(page, vma);
+		unsigned long address;
+
+		/*
+		 * During exec, a temporary VMA is setup and later moved.
+		 * The VMA is moved under the anon_vma lock but not the
+		 * page tables leading to a race where migration cannot
+		 * find the migration ptes. Rather than increasing the
+		 * locking requirements of exec(), migration skips
+		 * temporary VMAs until after exec() completes.
+		 */
+		if (PAGE_MIGRATION && (flags & TTU_MIGRATION) &&
+				is_vma_temporary_stack(vma))
+			continue;
+
+		address = vma_address(page, vma);
 		if (address == -EFAULT)
 			continue;
 		ret = try_to_unmap_one(page, vma, address, flags);
 		if (ret != SWAP_AGAIN || !page_mapped(page))
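
For context, the migration path reaches try_to_unmap_anon() via
try_to_unmap() with TTU_MIGRATION set; the unmap step in mm/migrate.c of
this era looks roughly like the following (a sketch for orientation, not
part of this patch):

  /* Establish migration ptes or remove ptes */
  try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);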