memcg: fix hotplugged memory zone oops

When MEMCG is configured on (even when it's disabled by boot option), when adding or removing a page to/from its lru list, the zone pointer used for stats updates is nowadays taken from the struct lruvec. (On many configurations, calculating zone from page is slower.) But we have no code to update all the lruvecs (per zone, per memcg) when a memory node is hotadded. Here's an extract from the oops which results when running numactl to bind a program to a newly onlined node: BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60 IP: __mod_zone_page_state+0x9/0x60 Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0) Call Trace: __pagevec_lru_add_fn+0xdf/0x140 pagevec_lru_move_fn+0xb1/0x100 __pagevec_lru_add+0x1c/0x30 lru_add_drain_cpu+0xa3/0x130 lru_add_drain+0x2f/0x40 ... The natural solution might be to use a memcg callback whenever memory is hotadded; but that solution has not been scoped out, and it happens that we do have an easy location at which to update lruvec->zone. The lruvec pointer is discovered either by mem_cgroup_zone_lruvec() or by mem_cgroup_page_lruvec(), and both of those do know the right zone. So check and set lruvec->zone in those; and remove the inadequate attempt to set lruvec->zone from lruvec_init(), which is called before NODE_DATA(node) has been allocated in such cases. Ah, there was one exceptionr. For no particularly good reason, mem_cgroup_force_empty_list() has its own code for deciding lruvec. Change it to use the standard mem_cgroup_zone_lruvec() and mem_cgroup_get_lru_size() too. In fact it was already safe against such an oops (the lru lists in danger could only be empty), but we're better proofed against future changes this way. I've marked this for stable (3.6) since we introduced the problem in 3.5 (now closed to stable); but I have no idea if this is the only fix needed to get memory hotadd working with memcg in 3.6, and received no answer when I enquired twice before. Reported-by: N Tang Chen <tangchen@cn.fujitsu.com> Signed-off-by: N Hugh Dickins <hughd@google.com> Acked-by: N Johannes Weiner <hannes@cmpxchg.org> Acked-by: N KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Konstantin Khlebnikov <khlebnikov@openvz.org> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: <stable@vger.kernel.org> Signed-off-by: N Andrew Morton <akpm@linux-foundation.org> Signed-off-by: N Linus Torvalds <torvalds@linux-foundation.org>

memcg: fix hotplugged memory zone oops
When MEMCG is configured on (even when it's disabled by boot option), when adding or removing a page to/from its lru list, the zone pointer used for stats updates is nowadays taken from the struct lruvec. (On many configurations, calculating zone from page is slower.) But we have no code to update all the lruvecs (per zone, per memcg) when a memory node is hotadded. Here's an extract from the oops which results when running numactl to bind a program to a newly onlined node: BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60 IP: __mod_zone_page_state+0x9/0x60 Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0) Call Trace: __pagevec_lru_add_fn+0xdf/0x140 pagevec_lru_move_fn+0xb1/0x100 __pagevec_lru_add+0x1c/0x30 lru_add_drain_cpu+0xa3/0x130 lru_add_drain+0x2f/0x40 ... The natural solution might be to use a memcg callback whenever memory is hotadded; but that solution has not been scoped out, and it happens that we do have an easy location at which to update lruvec->zone. The lruvec pointer is discovered either by mem_cgroup_zone_lruvec() or by mem_cgroup_page_lruvec(), and both of those do know the right zone. So check and set lruvec->zone in those; and remove the inadequate attempt to set lruvec->zone from lruvec_init(), which is called before NODE_DATA(node) has been allocated in such cases. Ah, there was one exceptionr. For no particularly good reason, mem_cgroup_force_empty_list() has its own code for deciding lruvec. Change it to use the standard mem_cgroup_zone_lruvec() and mem_cgroup_get_lru_size() too. In fact it was already safe against such an oops (the lru lists in danger could only be empty), but we're better proofed against future changes this way. I've marked this for stable (3.6) since we introduced the problem in 3.5 (now closed to stable); but I have no idea if this is the only fix needed to get memory hotadd working with memcg in 3.6, and received no answer when I enquired twice before. Reported-by: N Tang Chen <tangchen@cn.fujitsu.com> Signed-off-by: N Hugh Dickins <hughd@google.com> Acked-by: N Johannes Weiner <hannes@cmpxchg.org> Acked-by: N KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Konstantin Khlebnikov <khlebnikov@openvz.org> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: <stable@vger.kernel.org> Signed-off-by: N Andrew Morton <akpm@linux-foundation.org> Signed-off-by: N Linus Torvalds <torvalds@linux-foundation.org>
bea8c150 · Hugh Dickins · Linus Torvalds · 18f69427 · bea8c150 · bea8c150
显示空白变更内容
内联并排

Showing with 38 addition and 18 deletion

include/linux/mmzone.h include/linux/mmzone.h +1 -1

mm/memcontrol.c mm/memcontrol.c +35 -11

mm/mmzone.c mm/mmzone.c +1 -5

mm/page_alloc.c mm/page_alloc.c +1 -1

未找到文件。
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -752,7 +752,7 @@ extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
 				     unsigned long size,
 				     enum memmap_context context);
-extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
+extern void lruvec_init(struct lruvec *lruvec);
 static inline struct zone *lruvec_zone(struct lruvec *lruvec)
 {

--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1055,12 +1055,24 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
 				      struct mem_cgroup *memcg)
 {
 	struct mem_cgroup_per_zone *mz;
+	struct lruvec *lruvec;
-	if (mem_cgroup_disabled())
+	if (mem_cgroup_disabled()) {
-		return &zone->lruvec;
+		lruvec = &zone->lruvec;
+		goto out;
+	}
 	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
-	return &mz->lruvec;
+	lruvec = &mz->lruvec;
+out:
+	/*
+	 * Since a node can be onlined after the mem_cgroup was created,
+	 * we have to be prepared to initialize lruvec->zone here;
+	 * and if offlined then reonlined, we need to reinitialize it.
+	 */
+	if (unlikely(lruvec->zone != zone))
+		lruvec->zone = zone;
+	return lruvec;
 }
 /*
@@ -1087,9 +1099,12 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
 	struct mem_cgroup_per_zone *mz;
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc;
+	struct lruvec *lruvec;
-	if (mem_cgroup_disabled())
+	if (mem_cgroup_disabled()) {
-		return &zone->lruvec;
+		lruvec = &zone->lruvec;
+		goto out;
+	}
 	pc = lookup_page_cgroup(page);
 	memcg = pc->mem_cgroup;
@@ -1107,7 +1122,16 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
 		pc->mem_cgroup = memcg = root_mem_cgroup;
 	mz = page_cgroup_zoneinfo(memcg, page);
-	return &mz->lruvec;
+	lruvec = &mz->lruvec;
+out:
+	/*
+	 * Since a node can be onlined after the mem_cgroup was created,
+	 * we have to be prepared to initialize lruvec->zone here;
+	 * and if offlined then reonlined, we need to reinitialize it.
+	 */
+	if (unlikely(lruvec->zone != zone))
+		lruvec->zone = zone;
+	return lruvec;
 }
 /**
@@ -3697,17 +3721,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 				int node, int zid, enum lru_list lru)
 {
-	struct mem_cgroup_per_zone *mz;
+	struct lruvec *lruvec;
 	unsigned long flags, loop;
 	struct list_head *list;
 	struct page *busy;
 	struct zone *zone;
 	zone = &NODE_DATA(node)->node_zones[zid];
-	mz = mem_cgroup_zoneinfo(memcg, node, zid);
+	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
-	list = &mz->lruvec.lists[lru];
+	list = &lruvec->lists[lru];
-	loop = mz->lru_size[lru];
+	loop = mem_cgroup_get_lru_size(lruvec, lru);
 	/* give some margin against EBUSY etc...*/
 	loop += 256;
 	busy = NULL;
@@ -4745,7 +4769,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
-		lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
+		lruvec_init(&mz->lruvec);
 		mz->usage_in_excess = 0;
 		mz->on_tree = false;
 		mz->memcg = memcg;

--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pfn,
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-void lruvec_init(struct lruvec *lruvec, struct zone *zone)
+void lruvec_init(struct lruvec *lruvec)
 {
 	enum lru_list lru;
@@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone)
 	for_each_lru(lru)
 		INIT_LIST_HEAD(&lruvec->lists[lru]);
-#ifdef CONFIG_MEMCG
-	lruvec->zone = zone;
-#endif
 }
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4505,7 +4505,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		zone->zone_pgdat = pgdat;
 		zone_pcp_init(zone);
-		lruvec_init(&zone->lruvec, zone);
+		lruvec_init(&zone->lruvec);
 		if (!size)
 			continue;