提交 b1dd693e 编写于 作者: D Daisuke Nishimura 提交者: Linus Torvalds

memcg: avoid deadlock between move charge and try_charge()

__mem_cgroup_try_charge() can be called under down_write(&mmap_sem) (e.g.
mlock does it). This means it can cause a deadlock if it races with move charge:

Ex.1)
                move charge             |        try charge
  --------------------------------------+------------------------------
    mem_cgroup_can_attach()             |  down_write(&mmap_sem)
      mc.moving_task = current          |    ..
      mem_cgroup_precharge_mc()         |  __mem_cgroup_try_charge()
        mem_cgroup_count_precharge()    |    prepare_to_wait()
          down_read(&mmap_sem)          |    if (mc.moving_task)
          -> cannot acquire the lock    |    -> true
                                        |      schedule()

Ex.2)
                move charge             |        try charge
  --------------------------------------+------------------------------
    mem_cgroup_can_attach()             |
      mc.moving_task = current          |
      mem_cgroup_precharge_mc()         |
        mem_cgroup_count_precharge()    |
          down_read(&mmap_sem)          |
          ..                            |
          up_read(&mmap_sem)            |
                                        |  down_write(&mmap_sem)
    mem_cgroup_move_task()              |    ..
      mem_cgroup_move_charge()          |  __mem_cgroup_try_charge()
        down_read(&mmap_sem)            |    prepare_to_wait()
        -> cannot acquire the lock      |    if (mc.moving_task)
                                        |    -> true
                                        |      schedule()

To avoid this deadlock, we do all the move charge work (both can_attach() and
attach()) under a single mmap_sem read-side section.
Also, after this patch, we set/clear mc.moving_task outside mc.lock, because
the lock is used only to check mc.from/to.

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
上级 11e7946f
...@@ -278,13 +278,14 @@ enum move_type { ...@@ -278,13 +278,14 @@ enum move_type {
/* "mc" and its members are protected by cgroup_mutex */ /* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct { static struct move_charge_struct {
spinlock_t lock; /* for from, to, moving_task */ spinlock_t lock; /* for from, to */
struct mem_cgroup *from; struct mem_cgroup *from;
struct mem_cgroup *to; struct mem_cgroup *to;
unsigned long precharge; unsigned long precharge;
unsigned long moved_charge; unsigned long moved_charge;
unsigned long moved_swap; unsigned long moved_swap;
struct task_struct *moving_task; /* a task moving charges */ struct task_struct *moving_task; /* a task moving charges */
struct mm_struct *mm;
wait_queue_head_t waitq; /* a waitq for other context */ wait_queue_head_t waitq; /* a waitq for other context */
} mc = { } mc = {
.lock = __SPIN_LOCK_UNLOCKED(mc.lock), .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
...@@ -4631,7 +4632,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) ...@@ -4631,7 +4632,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
unsigned long precharge; unsigned long precharge;
struct vm_area_struct *vma; struct vm_area_struct *vma;
down_read(&mm->mmap_sem); /* We've already held the mmap_sem */
for (vma = mm->mmap; vma; vma = vma->vm_next) { for (vma = mm->mmap; vma; vma = vma->vm_next) {
struct mm_walk mem_cgroup_count_precharge_walk = { struct mm_walk mem_cgroup_count_precharge_walk = {
.pmd_entry = mem_cgroup_count_precharge_pte_range, .pmd_entry = mem_cgroup_count_precharge_pte_range,
...@@ -4643,7 +4644,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) ...@@ -4643,7 +4644,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
walk_page_range(vma->vm_start, vma->vm_end, walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_count_precharge_walk); &mem_cgroup_count_precharge_walk);
} }
up_read(&mm->mmap_sem);
precharge = mc.precharge; precharge = mc.precharge;
mc.precharge = 0; mc.precharge = 0;
...@@ -4694,11 +4694,16 @@ static void mem_cgroup_clear_mc(void) ...@@ -4694,11 +4694,16 @@ static void mem_cgroup_clear_mc(void)
mc.moved_swap = 0; mc.moved_swap = 0;
} }
if (mc.mm) {
up_read(&mc.mm->mmap_sem);
mmput(mc.mm);
}
spin_lock(&mc.lock); spin_lock(&mc.lock);
mc.from = NULL; mc.from = NULL;
mc.to = NULL; mc.to = NULL;
mc.moving_task = NULL;
spin_unlock(&mc.lock); spin_unlock(&mc.lock);
mc.moving_task = NULL;
mc.mm = NULL;
mem_cgroup_end_move(from); mem_cgroup_end_move(from);
memcg_oom_recover(from); memcg_oom_recover(from);
memcg_oom_recover(to); memcg_oom_recover(to);
...@@ -4724,12 +4729,21 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, ...@@ -4724,12 +4729,21 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
return 0; return 0;
/* We move charges only when we move a owner of the mm */ /* We move charges only when we move a owner of the mm */
if (mm->owner == p) { if (mm->owner == p) {
/*
* We do all the move charge works under one mmap_sem to
* avoid deadlock with down_write(&mmap_sem)
* -> try_charge() -> if (mc.moving_task) -> sleep.
*/
down_read(&mm->mmap_sem);
VM_BUG_ON(mc.from); VM_BUG_ON(mc.from);
VM_BUG_ON(mc.to); VM_BUG_ON(mc.to);
VM_BUG_ON(mc.precharge); VM_BUG_ON(mc.precharge);
VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap); VM_BUG_ON(mc.moved_swap);
VM_BUG_ON(mc.moving_task); VM_BUG_ON(mc.moving_task);
VM_BUG_ON(mc.mm);
mem_cgroup_start_move(from); mem_cgroup_start_move(from);
spin_lock(&mc.lock); spin_lock(&mc.lock);
mc.from = from; mc.from = from;
...@@ -4737,14 +4751,16 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, ...@@ -4737,14 +4751,16 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
mc.precharge = 0; mc.precharge = 0;
mc.moved_charge = 0; mc.moved_charge = 0;
mc.moved_swap = 0; mc.moved_swap = 0;
mc.moving_task = current;
spin_unlock(&mc.lock); spin_unlock(&mc.lock);
mc.moving_task = current;
mc.mm = mm;
ret = mem_cgroup_precharge_mc(mm); ret = mem_cgroup_precharge_mc(mm);
if (ret) if (ret)
mem_cgroup_clear_mc(); mem_cgroup_clear_mc();
} /* We call up_read() and mmput() in clear_mc(). */
mmput(mm); } else
mmput(mm);
} }
return ret; return ret;
} }
...@@ -4832,7 +4848,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) ...@@ -4832,7 +4848,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
struct vm_area_struct *vma; struct vm_area_struct *vma;
lru_add_drain_all(); lru_add_drain_all();
down_read(&mm->mmap_sem); /* We've already held the mmap_sem */
for (vma = mm->mmap; vma; vma = vma->vm_next) { for (vma = mm->mmap; vma; vma = vma->vm_next) {
int ret; int ret;
struct mm_walk mem_cgroup_move_charge_walk = { struct mm_walk mem_cgroup_move_charge_walk = {
...@@ -4851,7 +4867,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) ...@@ -4851,7 +4867,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
*/ */
break; break;
} }
up_read(&mm->mmap_sem);
} }
static void mem_cgroup_move_task(struct cgroup_subsys *ss, static void mem_cgroup_move_task(struct cgroup_subsys *ss,
...@@ -4860,17 +4875,11 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, ...@@ -4860,17 +4875,11 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct task_struct *p, struct task_struct *p,
bool threadgroup) bool threadgroup)
{ {
struct mm_struct *mm; if (!mc.mm)
if (!mc.to)
/* no need to move charge */ /* no need to move charge */
return; return;
mm = get_task_mm(p); mem_cgroup_move_charge(mc.mm);
if (mm) {
mem_cgroup_move_charge(mm);
mmput(mm);
}
mem_cgroup_clear_mc(); mem_cgroup_clear_mc();
} }
#else /* !CONFIG_MMU */ #else /* !CONFIG_MMU */
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册