提交 c0ff7453 编写于 作者: M Miao Xie 提交者: Linus Torvalds

cpuset,mm: fix no node to alloc memory when changing cpuset's mems

Before applying this patch, cpuset updates task->mems_allowed and the
mempolicy by setting all new bits in the nodemask first, and clearing all
old disallowed bits later.  But done this way, the allocator may find that
there is no node to allocate memory from.

The reason is that when cpuset rebinds the task's mempolicy, it clears the
nodes which the allocator can allocate pages on, for example:

(mpol: mempolicy)
	task1			task1's mpol	task2
	alloc page		1
	  alloc on node0? NO	1
				1		change mems from 1 to 0
				1		rebind task1's mpol
				0-1		  set new bits
				0	  	  clear disallowed bits
	  alloc on node1? NO	0
	  ...
	can't alloc page
	  goto oom

This patch fixes this problem by expanding the node range first (setting
newly allowed bits) and shrinking it lazily (clearing newly disallowed
bits).  To do so, we use a variable to tell the write-side task that the
read-side task is reading the nodemask, and the write-side task clears the
newly disallowed nodes only after the read-side task ends the current
memory allocation.

[akpm@linux-foundation.org: fix spello]
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Paul Menage <menage@google.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
上级 708c1bbc
...@@ -86,9 +86,44 @@ extern void rebuild_sched_domains(void); ...@@ -86,9 +86,44 @@ extern void rebuild_sched_domains(void);
extern void cpuset_print_task_mems_allowed(struct task_struct *p); extern void cpuset_print_task_mems_allowed(struct task_struct *p);
/*
* Reading current's mems_allowed and mempolicy in the fastpath must be
* protected by a get_mems_allowed()/put_mems_allowed() pair.
*
* get_mems_allowed() opens the read-side section: it increments
* current->mems_allowed_change_disable and then issues a full memory barrier.
*/
static inline void get_mems_allowed(void)
{
current->mems_allowed_change_disable++;
/*
* Ensure that reading mems_allowed and mempolicy happens after the
* update of ->mems_allowed_change_disable.
*
* When the write-side task finds that ->mems_allowed_change_disable
* is not 0, it knows the read-side task is reading mems_allowed or
* mempolicy, so it will clear the old bits lazily.
*/
smp_mb();
}
/*
* Close the read-side section opened by get_mems_allowed(): issue a full
* memory barrier, then decrement current->mems_allowed_change_disable.
*/
static inline void put_mems_allowed(void)
{
/*
* Ensure that reading mems_allowed and mempolicy happens before
* ->mems_allowed_change_disable is decremented.
*
* Until then the write-side task knows that the read-side task is
* still reading mems_allowed or mempolicy, and does not clear the old
* bits in the nodemask.
*/
smp_mb();
--ACCESS_ONCE(current->mems_allowed_change_disable);
}
static inline void set_mems_allowed(nodemask_t nodemask) static inline void set_mems_allowed(nodemask_t nodemask)
{ {
task_lock(current);
current->mems_allowed = nodemask; current->mems_allowed = nodemask;
task_unlock(current);
} }
#else /* !CONFIG_CPUSETS */ #else /* !CONFIG_CPUSETS */
...@@ -187,6 +222,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) ...@@ -187,6 +222,14 @@ static inline void set_mems_allowed(nodemask_t nodemask)
{ {
} }
/* !CONFIG_CPUSETS stub: there is no read-side state to track, so do nothing. */
static inline void get_mems_allowed(void)
{
}
/* !CONFIG_CPUSETS stub: counterpart of the empty get_mems_allowed(); does nothing. */
static inline void put_mems_allowed(void)
{
}
#endif /* !CONFIG_CPUSETS */ #endif /* !CONFIG_CPUSETS */
#endif /* _LINUX_CPUSET_H */ #endif /* _LINUX_CPUSET_H */
...@@ -1421,6 +1421,7 @@ struct task_struct { ...@@ -1421,6 +1421,7 @@ struct task_struct {
#endif #endif
#ifdef CONFIG_CPUSETS #ifdef CONFIG_CPUSETS
nodemask_t mems_allowed; /* Protected by alloc_lock */ nodemask_t mems_allowed; /* Protected by alloc_lock */
int mems_allowed_change_disable;
int cpuset_mem_spread_rotor; int cpuset_mem_spread_rotor;
#endif #endif
#ifdef CONFIG_CGROUPS #ifdef CONFIG_CGROUPS
......
...@@ -946,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, ...@@ -946,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
* In order to avoid seeing no nodes if the old and new nodes are disjoint, * In order to avoid seeing no nodes if the old and new nodes are disjoint,
* we structure updates as setting all new allowed nodes, then clearing newly * we structure updates as setting all new allowed nodes, then clearing newly
* disallowed ones. * disallowed ones.
*
* Called with task's alloc_lock held
*/ */
static void cpuset_change_task_nodemask(struct task_struct *tsk, static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems) nodemask_t *newmems)
{ {
repeat:
/*
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
*/
if (unlikely(test_thread_flag(TIF_MEMDIE)))
return;
if (current->flags & PF_EXITING) /* Let dying task have memory */
return;
task_lock(tsk);
nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
mpol_rebind_task(tsk, &tsk->mems_allowed, MPOL_REBIND_ONCE); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
mpol_rebind_task(tsk, newmems, MPOL_REBIND_ONCE);
/*
* Ensure that ->mems_allowed_change_disable is checked only after all
* newly allowed nodes have been set.
*
* The read-side task can then see a nodemask containing both the newly
* allowed nodes and the old allowed nodes, so even if it allocates a
* page while cpuset is clearing the newly disallowed ones, it still
* sees the newly allowed bits.
*
* If the newly allowed nodes were set after the check instead, setting
* the newly allowed nodes and clearing the newly disallowed ones could
* happen back to back, and the read-side task might find no node to
* allocate a page from.
*/
smp_mb();
/*
* Allocation of memory is very fast, we needn't sleep when waiting
* for the read-side.
*/
while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
task_unlock(tsk);
if (!task_curr(tsk))
yield();
goto repeat;
}
/*
* Ensure that ->mems_allowed_change_disable is checked before any newly
* disallowed nodes are cleared.
*
* If the newly disallowed bits were cleared before the check, the
* read-side task might find no node to allocate a page from.
*/
smp_mb();
mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
tsk->mems_allowed = *newmems; tsk->mems_allowed = *newmems;
task_unlock(tsk);
} }
/* /*
...@@ -978,9 +1024,7 @@ static void cpuset_change_nodemask(struct task_struct *p, ...@@ -978,9 +1024,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
cs = cgroup_cs(scan->cg); cs = cgroup_cs(scan->cg);
guarantee_online_mems(cs, newmems); guarantee_online_mems(cs, newmems);
task_lock(p);
cpuset_change_task_nodemask(p, newmems); cpuset_change_task_nodemask(p, newmems);
task_unlock(p);
NODEMASK_FREE(newmems); NODEMASK_FREE(newmems);
...@@ -1383,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, ...@@ -1383,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
err = set_cpus_allowed_ptr(tsk, cpus_attach); err = set_cpus_allowed_ptr(tsk, cpus_attach);
WARN_ON_ONCE(err); WARN_ON_ONCE(err);
task_lock(tsk);
cpuset_change_task_nodemask(tsk, to); cpuset_change_task_nodemask(tsk, to);
task_unlock(tsk);
cpuset_update_task_spread_flag(cs, tsk); cpuset_update_task_spread_flag(cs, tsk);
} }
......
...@@ -1002,8 +1002,10 @@ NORET_TYPE void do_exit(long code) ...@@ -1002,8 +1002,10 @@ NORET_TYPE void do_exit(long code)
exit_notify(tsk, group_dead); exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
task_lock(tsk);
mpol_put(tsk->mempolicy); mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL; tsk->mempolicy = NULL;
task_unlock(tsk);
#endif #endif
#ifdef CONFIG_FUTEX #ifdef CONFIG_FUTEX
if (unlikely(current->pi_state_cache)) if (unlikely(current->pi_state_cache))
......
...@@ -461,9 +461,15 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru); ...@@ -461,9 +461,15 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
struct page *__page_cache_alloc(gfp_t gfp) struct page *__page_cache_alloc(gfp_t gfp)
{ {
int n;
struct page *page;
if (cpuset_do_page_mem_spread()) { if (cpuset_do_page_mem_spread()) {
int n = cpuset_mem_spread_node(); get_mems_allowed();
return alloc_pages_exact_node(n, gfp, 0); n = cpuset_mem_spread_node();
page = alloc_pages_exact_node(n, gfp, 0);
put_mems_allowed();
return page;
} }
return alloc_pages(gfp, 0); return alloc_pages(gfp, 0);
} }
......
...@@ -465,11 +465,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, ...@@ -465,11 +465,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
struct page *page = NULL; struct page *page = NULL;
struct mempolicy *mpol; struct mempolicy *mpol;
nodemask_t *nodemask; nodemask_t *nodemask;
struct zonelist *zonelist = huge_zonelist(vma, address, struct zonelist *zonelist;
htlb_alloc_mask, &mpol, &nodemask);
struct zone *zone; struct zone *zone;
struct zoneref *z; struct zoneref *z;
get_mems_allowed();
zonelist = huge_zonelist(vma, address,
htlb_alloc_mask, &mpol, &nodemask);
/* /*
* A child process with MAP_PRIVATE mappings created by their parent * A child process with MAP_PRIVATE mappings created by their parent
* have no page reserves. This check ensures that reservations are * have no page reserves. This check ensures that reservations are
...@@ -477,11 +479,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, ...@@ -477,11 +479,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
*/ */
if (!vma_has_reserves(vma) && if (!vma_has_reserves(vma) &&
h->free_huge_pages - h->resv_huge_pages == 0) h->free_huge_pages - h->resv_huge_pages == 0)
return NULL; goto err;
/* If reserves cannot be used, ensure enough pages are in the pool */ /* If reserves cannot be used, ensure enough pages are in the pool */
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
return NULL; goto err;;
for_each_zone_zonelist_nodemask(zone, z, zonelist, for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) { MAX_NR_ZONES - 1, nodemask) {
...@@ -500,7 +502,9 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, ...@@ -500,7 +502,9 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
break; break;
} }
} }
err:
mpol_cond_put(mpol); mpol_cond_put(mpol);
put_mems_allowed();
return page; return page;
} }
......
...@@ -1639,6 +1639,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol, ...@@ -1639,6 +1639,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
* to the struct mempolicy for conditional unref after allocation. * to the struct mempolicy for conditional unref after allocation.
* If the effective policy is 'BIND, returns a pointer to the mempolicy's * If the effective policy is 'BIND, returns a pointer to the mempolicy's
* @nodemask for filtering the zonelist. * @nodemask for filtering the zonelist.
*
* Must be protected by get_mems_allowed()
*/ */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
gfp_t gfp_flags, struct mempolicy **mpol, gfp_t gfp_flags, struct mempolicy **mpol,
...@@ -1684,6 +1686,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) ...@@ -1684,6 +1686,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
if (!(mask && current->mempolicy)) if (!(mask && current->mempolicy))
return false; return false;
task_lock(current);
mempolicy = current->mempolicy; mempolicy = current->mempolicy;
switch (mempolicy->mode) { switch (mempolicy->mode) {
case MPOL_PREFERRED: case MPOL_PREFERRED:
...@@ -1703,6 +1706,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) ...@@ -1703,6 +1706,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
default: default:
BUG(); BUG();
} }
task_unlock(current);
return true; return true;
} }
...@@ -1750,13 +1754,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) ...@@ -1750,13 +1754,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{ {
struct mempolicy *pol = get_vma_policy(current, vma, addr); struct mempolicy *pol = get_vma_policy(current, vma, addr);
struct zonelist *zl; struct zonelist *zl;
struct page *page;
get_mems_allowed();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) { if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid; unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
mpol_cond_put(pol); mpol_cond_put(pol);
return alloc_page_interleave(gfp, 0, nid); page = alloc_page_interleave(gfp, 0, nid);
put_mems_allowed();
return page;
} }
zl = policy_zonelist(gfp, pol); zl = policy_zonelist(gfp, pol);
if (unlikely(mpol_needs_cond_ref(pol))) { if (unlikely(mpol_needs_cond_ref(pol))) {
...@@ -1766,12 +1774,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) ...@@ -1766,12 +1774,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
struct page *page = __alloc_pages_nodemask(gfp, 0, struct page *page = __alloc_pages_nodemask(gfp, 0,
zl, policy_nodemask(gfp, pol)); zl, policy_nodemask(gfp, pol));
__mpol_put(pol); __mpol_put(pol);
put_mems_allowed();
return page; return page;
} }
/* /*
* fast path: default or task policy * fast path: default or task policy
*/ */
return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
put_mems_allowed();
return page;
} }
/** /**
...@@ -1796,18 +1807,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) ...@@ -1796,18 +1807,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
struct page *alloc_pages_current(gfp_t gfp, unsigned order) struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{ {
struct mempolicy *pol = current->mempolicy; struct mempolicy *pol = current->mempolicy;
struct page *page;
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy; pol = &default_policy;
get_mems_allowed();
/* /*
* No reference counting needed for current->mempolicy * No reference counting needed for current->mempolicy
* nor system default_policy * nor system default_policy
*/ */
if (pol->mode == MPOL_INTERLEAVE) if (pol->mode == MPOL_INTERLEAVE)
return alloc_page_interleave(gfp, order, interleave_nodes(pol)); page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
return __alloc_pages_nodemask(gfp, order, else
page = __alloc_pages_nodemask(gfp, order,
policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
put_mems_allowed();
return page;
} }
EXPORT_SYMBOL(alloc_pages_current); EXPORT_SYMBOL(alloc_pages_current);
......
...@@ -1990,10 +1990,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, ...@@ -1990,10 +1990,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone)) if (unlikely(!zonelist->_zonerefs->zone))
return NULL; return NULL;
get_mems_allowed();
/* The preferred zone is used for statistics later */ /* The preferred zone is used for statistics later */
first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
if (!preferred_zone) if (!preferred_zone) {
put_mems_allowed();
return NULL; return NULL;
}
/* First allocation attempt */ /* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
...@@ -2003,6 +2006,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, ...@@ -2003,6 +2006,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
page = __alloc_pages_slowpath(gfp_mask, order, page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask, zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype); preferred_zone, migratetype);
put_mems_allowed();
trace_mm_page_alloc(page, order, gfp_mask, migratetype); trace_mm_page_alloc(page, order, gfp_mask, migratetype);
return page; return page;
......
...@@ -3217,10 +3217,12 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) ...@@ -3217,10 +3217,12 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
if (in_interrupt() || (flags & __GFP_THISNODE)) if (in_interrupt() || (flags & __GFP_THISNODE))
return NULL; return NULL;
nid_alloc = nid_here = numa_node_id(); nid_alloc = nid_here = numa_node_id();
get_mems_allowed();
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
nid_alloc = cpuset_mem_spread_node(); nid_alloc = cpuset_mem_spread_node();
else if (current->mempolicy) else if (current->mempolicy)
nid_alloc = slab_node(current->mempolicy); nid_alloc = slab_node(current->mempolicy);
put_mems_allowed();
if (nid_alloc != nid_here) if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc); return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL; return NULL;
...@@ -3247,6 +3249,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) ...@@ -3247,6 +3249,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
if (flags & __GFP_THISNODE) if (flags & __GFP_THISNODE)
return NULL; return NULL;
get_mems_allowed();
zonelist = node_zonelist(slab_node(current->mempolicy), flags); zonelist = node_zonelist(slab_node(current->mempolicy), flags);
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
...@@ -3302,6 +3305,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) ...@@ -3302,6 +3305,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
} }
} }
} }
put_mems_allowed();
return obj; return obj;
} }
......
...@@ -1360,6 +1360,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) ...@@ -1360,6 +1360,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
get_cycles() % 1024 > s->remote_node_defrag_ratio) get_cycles() % 1024 > s->remote_node_defrag_ratio)
return NULL; return NULL;
get_mems_allowed();
zonelist = node_zonelist(slab_node(current->mempolicy), flags); zonelist = node_zonelist(slab_node(current->mempolicy), flags);
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
struct kmem_cache_node *n; struct kmem_cache_node *n;
...@@ -1369,10 +1370,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) ...@@ -1369,10 +1370,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
if (n && cpuset_zone_allowed_hardwall(zone, flags) && if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
n->nr_partial > s->min_partial) { n->nr_partial > s->min_partial) {
page = get_partial_node(n); page = get_partial_node(n);
if (page) if (page) {
put_mems_allowed();
return page; return page;
}
} }
} }
put_mems_allowed();
#endif #endif
return NULL; return NULL;
} }
......
...@@ -1774,6 +1774,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, ...@@ -1774,6 +1774,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
unsigned long writeback_threshold; unsigned long writeback_threshold;
get_mems_allowed();
delayacct_freepages_start(); delayacct_freepages_start();
if (scanning_global_lru(sc)) if (scanning_global_lru(sc))
...@@ -1857,6 +1858,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, ...@@ -1857,6 +1858,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
delayacct_freepages_end(); delayacct_freepages_end();
put_mems_allowed();
return ret; return ret;
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册