提交 dac1d27b 编写于 作者: M Mel Gorman 提交者: Linus Torvalds

mm: use zonelists instead of zones when direct reclaiming pages

The following patches replace multiple zonelists per node with two zonelists
that are filtered based on the GFP flags.  The patches as a set fix a bug with
regard to the use of MPOL_BIND and ZONE_MOVABLE.  With this patchset, the
MPOL_BIND will apply to the two highest zones when the highest zone is
ZONE_MOVABLE.  This should be considered as an alternative fix for the
MPOL_BIND+ZONE_MOVABLE in 2.6.23 to the previously discussed hack that filters
only custom zonelists.

The first patch cleans up an inconsistency where direct reclaim uses
zonelist->zones where other places use zonelist.

The second patch introduces a helper function node_zonelist() for looking up
the appropriate zonelist for a GFP mask which simplifies patches later in the
set.

The third patch defines/remembers the "preferred zone" for numa statistics, as
it is no longer always the first zone in a zonelist.

The fourth patch replaces multiple zonelists with two zonelists that are
filtered.  The two zonelists are due to the fact that the memoryless patchset
introduces a second set of zonelists for __GFP_THISNODE.

The fifth patch introduces helper macros for retrieving the zone and node
indices of entries in a zonelist.

The final patch introduces filtering of the zonelists based on a nodemask.
Two zonelists exist per node, one for normal allocations and one for
__GFP_THISNODE.

Performance results varied depending on the machine configuration.  In real
workloads the gain/loss will depend on how much the userspace portion of the
benchmark benefits from having more cache available due to reduced referencing
of zonelists.

These are the ranges of performance losses/gains when running against
2.6.24-rc4-mm1.  The machines tested are a mix of i386, x86_64 and
ppc64, both NUMA and non-NUMA.
			     loss   to  gain
Total CPU time on Kernbench: -0.86% to  1.13%
Elapsed   time on Kernbench: -0.79% to  0.76%
page_test from aim9:         -4.37% to  0.79%
brk_test  from aim9:         -0.71% to  4.07%
fork_test from aim9:         -1.84% to  4.60%
exec_test from aim9:         -0.71% to  1.08%

This patch:

The allocator deals with zonelists which indicate the order in which zones
should be targeted for an allocation.  Similarly, direct reclaim of pages
iterates over an array of zones.  For consistency, this patch converts direct
reclaim to use a zonelist.  No functionality is changed by this patch.  This
simplifies zonelist iterators in the next patch.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
上级 9d02dbc8
...@@ -360,16 +360,16 @@ void invalidate_bdev(struct block_device *bdev) ...@@ -360,16 +360,16 @@ void invalidate_bdev(struct block_device *bdev)
*/ */
static void free_more_memory(void) static void free_more_memory(void)
{ {
struct zone **zones; struct zonelist *zonelist;
pg_data_t *pgdat; pg_data_t *pgdat;
wakeup_pdflush(1024); wakeup_pdflush(1024);
yield(); yield();
for_each_online_pgdat(pgdat) { for_each_online_pgdat(pgdat) {
zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; zonelist = &pgdat->node_zonelists[gfp_zone(GFP_NOFS)];
if (*zones) if (zonelist->zones[0])
try_to_free_pages(zones, 0, GFP_NOFS); try_to_free_pages(zonelist, 0, GFP_NOFS);
} }
} }
......
...@@ -181,7 +181,7 @@ extern int rotate_reclaimable_page(struct page *page); ...@@ -181,7 +181,7 @@ extern int rotate_reclaimable_page(struct page *page);
extern void swap_setup(void); extern void swap_setup(void);
/* linux/mm/vmscan.c */ /* linux/mm/vmscan.c */
extern unsigned long try_to_free_pages(struct zone **zones, int order, extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask); gfp_t gfp_mask);
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
gfp_t gfp_mask); gfp_t gfp_mask);
......
...@@ -1569,7 +1569,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, ...@@ -1569,7 +1569,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
reclaim_state.reclaimed_slab = 0; reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state; p->reclaim_state = &reclaim_state;
did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask); did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
p->reclaim_state = NULL; p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC; p->flags &= ~PF_MEMALLOC;
......
...@@ -1246,10 +1246,11 @@ static unsigned long shrink_zone(int priority, struct zone *zone, ...@@ -1246,10 +1246,11 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
* If a zone is deemed to be full of pinned pages then just give it a light * If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it. * scan then give up on it.
*/ */
static unsigned long shrink_zones(int priority, struct zone **zones, static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc) struct scan_control *sc)
{ {
unsigned long nr_reclaimed = 0; unsigned long nr_reclaimed = 0;
struct zone **zones = zonelist->zones;
int i; int i;
...@@ -1301,8 +1302,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones, ...@@ -1301,8 +1302,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
* holds filesystem locks which prevent writeout this might not work, and the * holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail. * allocation attempt will fail.
*/ */
static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc) gfp_t gfp_mask, struct scan_control *sc)
{ {
int priority; int priority;
int ret = 0; int ret = 0;
...@@ -1310,6 +1311,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, ...@@ -1310,6 +1311,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
unsigned long nr_reclaimed = 0; unsigned long nr_reclaimed = 0;
struct reclaim_state *reclaim_state = current->reclaim_state; struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long lru_pages = 0; unsigned long lru_pages = 0;
struct zone **zones = zonelist->zones;
int i; int i;
if (scan_global_lru(sc)) if (scan_global_lru(sc))
...@@ -1333,7 +1335,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, ...@@ -1333,7 +1335,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
sc->nr_scanned = 0; sc->nr_scanned = 0;
if (!priority) if (!priority)
disable_swap_token(); disable_swap_token();
nr_reclaimed += shrink_zones(priority, zones, sc); nr_reclaimed += shrink_zones(priority, zonelist, sc);
/* /*
* Don't shrink slabs when reclaiming memory from * Don't shrink slabs when reclaiming memory from
* over limit cgroups * over limit cgroups
...@@ -1397,7 +1399,8 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, ...@@ -1397,7 +1399,8 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
return ret; return ret;
} }
unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask)
{ {
struct scan_control sc = { struct scan_control sc = {
.gfp_mask = gfp_mask, .gfp_mask = gfp_mask,
...@@ -1410,7 +1413,7 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) ...@@ -1410,7 +1413,7 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
.isolate_pages = isolate_pages_global, .isolate_pages = isolate_pages_global,
}; };
return do_try_to_free_pages(zones, gfp_mask, &sc); return do_try_to_free_pages(zonelist, gfp_mask, &sc);
} }
#ifdef CONFIG_CGROUP_MEM_RES_CTLR #ifdef CONFIG_CGROUP_MEM_RES_CTLR
...@@ -1428,11 +1431,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, ...@@ -1428,11 +1431,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
.mem_cgroup = mem_cont, .mem_cgroup = mem_cont,
.isolate_pages = mem_cgroup_isolate_pages, .isolate_pages = mem_cgroup_isolate_pages,
}; };
struct zone **zones; struct zonelist *zonelist;
int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE); int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE);
zones = NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones; zonelist = &NODE_DATA(numa_node_id())->node_zonelists[target_zone];
if (do_try_to_free_pages(zones, sc.gfp_mask, &sc)) if (do_try_to_free_pages(zonelist, sc.gfp_mask, &sc))
return 1; return 1;
return 0; return 0;
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册