提交 08e552c6 编写于 作者: K KAMEZAWA Hiroyuki 提交者: Linus Torvalds

memcg: synchronized LRU

A big patch for changing memcg's LRU semantics.

Now,
  - page_cgroup is linked to its mem_cgroup's own LRU (per zone).

  - LRU of page_cgroup is not synchronous with global LRU.

  - page and page_cgroup are one-to-one and statically allocated.

  - To find which LRU a page_cgroup is on, you have to check pc->mem_cgroup as
    - lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);

  - SwapCache is handled.

And, when we handle the LRU list of page_cgroup, we do the following.

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc); .....................(1)
	mz = page_cgroup_zoneinfo(pc);
	spin_lock(&mz->lru_lock);
	.....add to LRU
	spin_unlock(&mz->lru_lock);
	unlock_page_cgroup(pc);

But (1) is a spin_lock, and we have to worry about deadlock with zone->lru_lock.
So, trylock() is currently used at (1). Without (1), we can't trust that "mz" is correct.

This is an attempt to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, the above sequence can be written as

        spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
	mem_cgroup_add/remove/etc_lru() {
		pc = lookup_page_cgroup(page);
		mz = page_cgroup_zoneinfo(pc);
		if (PageCgroupUsed(pc)) {
			....add to LRU
		}
	}
        spin_unlock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU

This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc), because:
    1. pc->mem_cgroup can be modified only
       - at charge.
       - at account_move().
    2. at charge
       the PCG_USED bit is not set before pc->mem_cgroup is fixed.
    3. at account_move()
       the page is isolated and not on LRU.

Pros.
  - easy for maintenance.
  - memcg can make use of laziness of pagevec.
  - we don't have to duplicate the LRU/Active/Unevictable bits in page_cgroup.
  - LRU status of memcg will be synchronized with global LRU's one.
  - the number of locks is reduced.
  - account_move() is simplified very much.
Cons.
  - may increase cost of LRU rotation.
    (no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
上级 8c7c6e34
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include <linux/file.h> #include <linux/file.h>
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/splice.h> #include <linux/splice.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h> #include <linux/mm_inline.h>
#include <linux/swap.h> #include <linux/swap.h>
#include <linux/writeback.h> #include <linux/writeback.h>
......
...@@ -40,7 +40,12 @@ extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr); ...@@ -40,7 +40,12 @@ extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr);
extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask); gfp_t gfp_mask);
extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru); extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru);
extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru);
extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru);
extern void mem_cgroup_del_lru(struct page *page);
extern void mem_cgroup_move_lists(struct page *page,
enum lru_list from, enum lru_list to);
extern void mem_cgroup_uncharge_page(struct page *page); extern void mem_cgroup_uncharge_page(struct page *page);
extern void mem_cgroup_uncharge_cache_page(struct page *page); extern void mem_cgroup_uncharge_cache_page(struct page *page);
extern int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask); extern int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask);
...@@ -131,7 +136,27 @@ static inline int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) ...@@ -131,7 +136,27 @@ static inline int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
return 0; return 0;
} }
static inline void mem_cgroup_move_lists(struct page *page, bool active) static inline void mem_cgroup_add_lru_list(struct page *page, int lru)
{
}
static inline void mem_cgroup_del_lru_list(struct page *page, int lru)
{
return ;
}
static inline void mem_cgroup_rotate_lru_list(struct page *page, int lru)
{
return ;
}
static inline void mem_cgroup_del_lru(struct page *page)
{
return ;
}
static inline void
mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to)
{ {
} }
......
...@@ -28,6 +28,7 @@ add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) ...@@ -28,6 +28,7 @@ add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
{ {
list_add(&page->lru, &zone->lru[l].list); list_add(&page->lru, &zone->lru[l].list);
__inc_zone_state(zone, NR_LRU_BASE + l); __inc_zone_state(zone, NR_LRU_BASE + l);
mem_cgroup_add_lru_list(page, l);
} }
static inline void static inline void
...@@ -35,6 +36,7 @@ del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) ...@@ -35,6 +36,7 @@ del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
{ {
list_del(&page->lru); list_del(&page->lru);
__dec_zone_state(zone, NR_LRU_BASE + l); __dec_zone_state(zone, NR_LRU_BASE + l);
mem_cgroup_del_lru_list(page, l);
} }
static inline void static inline void
...@@ -54,6 +56,7 @@ del_page_from_lru(struct zone *zone, struct page *page) ...@@ -54,6 +56,7 @@ del_page_from_lru(struct zone *zone, struct page *page)
l += page_is_file_cache(page); l += page_is_file_cache(page);
} }
__dec_zone_state(zone, NR_LRU_BASE + l); __dec_zone_state(zone, NR_LRU_BASE + l);
mem_cgroup_del_lru_list(page, l);
} }
/** /**
......
...@@ -26,10 +26,6 @@ enum { ...@@ -26,10 +26,6 @@ enum {
PCG_LOCK, /* page cgroup is locked */ PCG_LOCK, /* page cgroup is locked */
PCG_CACHE, /* charged as cache */ PCG_CACHE, /* charged as cache */
PCG_USED, /* this object is in use. */ PCG_USED, /* this object is in use. */
/* flags for LRU placement */
PCG_ACTIVE, /* page is active in this cgroup */
PCG_FILE, /* page is file system backed */
PCG_UNEVICTABLE, /* page is unevictableable */
}; };
#define TESTPCGFLAG(uname, lname) \ #define TESTPCGFLAG(uname, lname) \
...@@ -50,19 +46,6 @@ TESTPCGFLAG(Cache, CACHE) ...@@ -50,19 +46,6 @@ TESTPCGFLAG(Cache, CACHE)
TESTPCGFLAG(Used, USED) TESTPCGFLAG(Used, USED)
CLEARPCGFLAG(Used, USED) CLEARPCGFLAG(Used, USED)
/* LRU management flags (from global-lru definition) */
TESTPCGFLAG(File, FILE)
SETPCGFLAG(File, FILE)
CLEARPCGFLAG(File, FILE)
TESTPCGFLAG(Active, ACTIVE)
SETPCGFLAG(Active, ACTIVE)
CLEARPCGFLAG(Active, ACTIVE)
TESTPCGFLAG(Unevictable, UNEVICTABLE)
SETPCGFLAG(Unevictable, UNEVICTABLE)
CLEARPCGFLAG(Unevictable, UNEVICTABLE)
static inline int page_cgroup_nid(struct page_cgroup *pc) static inline int page_cgroup_nid(struct page_cgroup *pc)
{ {
return page_to_nid(pc->page); return page_to_nid(pc->page);
......
...@@ -36,6 +36,7 @@ ...@@ -36,6 +36,7 @@
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include <linux/mm_inline.h> #include <linux/mm_inline.h>
#include <linux/page_cgroup.h> #include <linux/page_cgroup.h>
#include "internal.h"
#include <asm/uaccess.h> #include <asm/uaccess.h>
...@@ -100,7 +101,6 @@ struct mem_cgroup_per_zone { ...@@ -100,7 +101,6 @@ struct mem_cgroup_per_zone {
/* /*
* spin_lock to protect the per cgroup LRU * spin_lock to protect the per cgroup LRU
*/ */
spinlock_t lru_lock;
struct list_head lists[NR_LRU_LISTS]; struct list_head lists[NR_LRU_LISTS];
unsigned long count[NR_LRU_LISTS]; unsigned long count[NR_LRU_LISTS];
}; };
...@@ -163,14 +163,12 @@ enum charge_type { ...@@ -163,14 +163,12 @@ enum charge_type {
/* only for here (for easy reading.) */ /* only for here (for easy reading.) */
#define PCGF_CACHE (1UL << PCG_CACHE) #define PCGF_CACHE (1UL << PCG_CACHE)
#define PCGF_USED (1UL << PCG_USED) #define PCGF_USED (1UL << PCG_USED)
#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
#define PCGF_LOCK (1UL << PCG_LOCK) #define PCGF_LOCK (1UL << PCG_LOCK)
#define PCGF_FILE (1UL << PCG_FILE)
static const unsigned long static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = { pcg_default_flags[NR_CHARGE_TYPE] = {
PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ PCGF_USED | PCGF_LOCK, /* Anon */
PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
0, /* FORCE */ 0, /* FORCE */
}; };
...@@ -185,9 +183,6 @@ pcg_default_flags[NR_CHARGE_TYPE] = { ...@@ -185,9 +183,6 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
static void mem_cgroup_get(struct mem_cgroup *mem); static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem); static void mem_cgroup_put(struct mem_cgroup *mem);
/*
* Always modified under lru lock. Then, not necessary to preempt_disable()
*/
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
struct page_cgroup *pc, struct page_cgroup *pc,
bool charge) bool charge)
...@@ -195,10 +190,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, ...@@ -195,10 +190,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
int val = (charge)? 1 : -1; int val = (charge)? 1 : -1;
struct mem_cgroup_stat *stat = &mem->stat; struct mem_cgroup_stat *stat = &mem->stat;
struct mem_cgroup_stat_cpu *cpustat; struct mem_cgroup_stat_cpu *cpustat;
int cpu = get_cpu();
VM_BUG_ON(!irqs_disabled()); cpustat = &stat->cpustat[cpu];
cpustat = &stat->cpustat[smp_processor_id()];
if (PageCgroupCache(pc)) if (PageCgroupCache(pc))
__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
else else
...@@ -210,6 +204,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, ...@@ -210,6 +204,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
else else
__mem_cgroup_stat_add_safe(cpustat, __mem_cgroup_stat_add_safe(cpustat,
MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
put_cpu();
} }
static struct mem_cgroup_per_zone * static struct mem_cgroup_per_zone *
...@@ -264,80 +259,95 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) ...@@ -264,80 +259,95 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
struct mem_cgroup, css); struct mem_cgroup, css);
} }
static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, /*
struct page_cgroup *pc) * Following LRU functions are allowed to be used without PCG_LOCK.
{ * Operations are called by routine of global LRU independently from memcg.
int lru = LRU_BASE; * What we have to take care of here is validness of pc->mem_cgroup.
*
* Changes to pc->mem_cgroup happens when
* 1. charge
* 2. moving account
* In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
* It is added to LRU before charge.
* If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
* When moving account, the page is not on LRU. It's isolated.
*/
if (PageCgroupUnevictable(pc)) void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
lru = LRU_UNEVICTABLE; {
else { struct page_cgroup *pc;
if (PageCgroupActive(pc)) struct mem_cgroup *mem;
lru += LRU_ACTIVE; struct mem_cgroup_per_zone *mz;
if (PageCgroupFile(pc))
lru += LRU_FILE;
}
if (mem_cgroup_subsys.disabled)
return;
pc = lookup_page_cgroup(page);
/* can happen while we handle swapcache. */
if (list_empty(&pc->lru))
return;
mz = page_cgroup_zoneinfo(pc);
mem = pc->mem_cgroup;
MEM_CGROUP_ZSTAT(mz, lru) -= 1; MEM_CGROUP_ZSTAT(mz, lru) -= 1;
list_del_init(&pc->lru);
mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); return;
list_del(&pc->lru);
} }
static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, void mem_cgroup_del_lru(struct page *page)
struct page_cgroup *pc, bool hot)
{ {
int lru = LRU_BASE; mem_cgroup_del_lru_list(page, page_lru(page));
}
if (PageCgroupUnevictable(pc)) void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
lru = LRU_UNEVICTABLE; {
else { struct mem_cgroup_per_zone *mz;
if (PageCgroupActive(pc)) struct page_cgroup *pc;
lru += LRU_ACTIVE;
if (PageCgroupFile(pc))
lru += LRU_FILE;
}
MEM_CGROUP_ZSTAT(mz, lru) += 1; if (mem_cgroup_subsys.disabled)
if (hot) return;
list_add(&pc->lru, &mz->lists[lru]);
else
list_add_tail(&pc->lru, &mz->lists[lru]);
mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); pc = lookup_page_cgroup(page);
smp_rmb();
/* unused page is not rotated. */
if (!PageCgroupUsed(pc))
return;
mz = page_cgroup_zoneinfo(pc);
list_move(&pc->lru, &mz->lists[lru]);
} }
static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
{ {
struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); struct page_cgroup *pc;
int active = PageCgroupActive(pc); struct mem_cgroup_per_zone *mz;
int file = PageCgroupFile(pc);
int unevictable = PageCgroupUnevictable(pc);
enum lru_list from = unevictable ? LRU_UNEVICTABLE :
(LRU_FILE * !!file + !!active);
if (lru == from) if (mem_cgroup_subsys.disabled)
return;
pc = lookup_page_cgroup(page);
/* barrier to sync with "charge" */
smp_rmb();
if (!PageCgroupUsed(pc))
return; return;
MEM_CGROUP_ZSTAT(mz, from) -= 1; mz = page_cgroup_zoneinfo(pc);
/*
* However this is done under mz->lru_lock, another flags, which
* are not related to LRU, will be modified from out-of-lock.
* We have to use atomic set/clear flags.
*/
if (is_unevictable_lru(lru)) {
ClearPageCgroupActive(pc);
SetPageCgroupUnevictable(pc);
} else {
if (is_active_lru(lru))
SetPageCgroupActive(pc);
else
ClearPageCgroupActive(pc);
ClearPageCgroupUnevictable(pc);
}
MEM_CGROUP_ZSTAT(mz, lru) += 1; MEM_CGROUP_ZSTAT(mz, lru) += 1;
list_move(&pc->lru, &mz->lists[lru]); list_add(&pc->lru, &mz->lists[lru]);
}
/*
* To add swapcache into LRU. Be careful to all this function.
* zone->lru_lock shouldn't be held and irq must not be disabled.
*/
static void mem_cgroup_lru_fixup(struct page *page)
{
if (!isolate_lru_page(page))
putback_lru_page(page);
}
void mem_cgroup_move_lists(struct page *page,
enum lru_list from, enum lru_list to)
{
if (mem_cgroup_subsys.disabled)
return;
mem_cgroup_del_lru_list(page, from);
mem_cgroup_add_lru_list(page, to);
} }
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
...@@ -350,37 +360,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) ...@@ -350,37 +360,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
return ret; return ret;
} }
/*
* This routine assumes that the appropriate zone's lru lock is already held
*/
void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
{
struct page_cgroup *pc;
struct mem_cgroup_per_zone *mz;
unsigned long flags;
if (mem_cgroup_subsys.disabled)
return;
/*
* We cannot lock_page_cgroup while holding zone's lru_lock,
* because other holders of lock_page_cgroup can be interrupted
* with an attempt to rotate_reclaimable_page. But we cannot
* safely get to page_cgroup without it, so just try_lock it:
* mem_cgroup_isolate_pages allows for page left on wrong list.
*/
pc = lookup_page_cgroup(page);
if (!trylock_page_cgroup(pc))
return;
if (pc && PageCgroupUsed(pc)) {
mz = page_cgroup_zoneinfo(pc);
spin_lock_irqsave(&mz->lru_lock, flags);
__mem_cgroup_move_lists(pc, lru);
spin_unlock_irqrestore(&mz->lru_lock, flags);
}
unlock_page_cgroup(pc);
}
/* /*
* Calculate mapped_ratio under memory controller. This will be used in * Calculate mapped_ratio under memory controller. This will be used in
* vmscan.c for deteremining we have to reclaim mapped pages. * vmscan.c for deteremining we have to reclaim mapped pages.
...@@ -460,40 +439,24 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, ...@@ -460,40 +439,24 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
src = &mz->lists[lru]; src = &mz->lists[lru];
spin_lock(&mz->lru_lock);
scan = 0; scan = 0;
list_for_each_entry_safe_reverse(pc, tmp, src, lru) { list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
if (scan >= nr_to_scan) if (scan >= nr_to_scan)
break; break;
page = pc->page;
if (unlikely(!PageCgroupUsed(pc))) if (unlikely(!PageCgroupUsed(pc)))
continue; continue;
page = pc->page;
if (unlikely(!PageLRU(page))) if (unlikely(!PageLRU(page)))
continue; continue;
/*
* TODO: play better with lumpy reclaim, grabbing anything.
*/
if (PageUnevictable(page) ||
(PageActive(page) && !active) ||
(!PageActive(page) && active)) {
__mem_cgroup_move_lists(pc, page_lru(page));
continue;
}
scan++; scan++;
list_move(&pc->lru, &pc_list);
if (__isolate_lru_page(page, mode, file) == 0) { if (__isolate_lru_page(page, mode, file) == 0) {
list_move(&page->lru, dst); list_move(&page->lru, dst);
nr_taken++; nr_taken++;
} }
} }
list_splice(&pc_list, src);
spin_unlock(&mz->lru_lock);
*scanned = scan; *scanned = scan;
return nr_taken; return nr_taken;
} }
...@@ -608,9 +571,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, ...@@ -608,9 +571,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
struct page_cgroup *pc, struct page_cgroup *pc,
enum charge_type ctype) enum charge_type ctype)
{ {
struct mem_cgroup_per_zone *mz;
unsigned long flags;
/* try_charge() can return NULL to *memcg, taking care of it. */ /* try_charge() can return NULL to *memcg, taking care of it. */
if (!mem) if (!mem)
return; return;
...@@ -625,17 +585,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, ...@@ -625,17 +585,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
return; return;
} }
pc->mem_cgroup = mem; pc->mem_cgroup = mem;
/* smp_wmb();
* If a page is accounted as a page cache, insert to inactive list.
* If anon, insert to active list.
*/
pc->flags = pcg_default_flags[ctype]; pc->flags = pcg_default_flags[ctype];
mz = page_cgroup_zoneinfo(pc); mem_cgroup_charge_statistics(mem, pc, true);
spin_lock_irqsave(&mz->lru_lock, flags);
__mem_cgroup_add_list(mz, pc, true);
spin_unlock_irqrestore(&mz->lru_lock, flags);
unlock_page_cgroup(pc); unlock_page_cgroup(pc);
} }
...@@ -646,8 +600,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, ...@@ -646,8 +600,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
* @to: mem_cgroup which the page is moved to. @from != @to. * @to: mem_cgroup which the page is moved to. @from != @to.
* *
* The caller must confirm following. * The caller must confirm following.
* 1. disable irq. * - page is not on LRU (isolate_page() is useful.)
* 2. lru_lock of old mem_cgroup(@from) should be held.
* *
* returns 0 at success, * returns 0 at success,
* returns -EBUSY when lock is busy or "pc" is unstable. * returns -EBUSY when lock is busy or "pc" is unstable.
...@@ -663,15 +616,14 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, ...@@ -663,15 +616,14 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
int nid, zid; int nid, zid;
int ret = -EBUSY; int ret = -EBUSY;
VM_BUG_ON(!irqs_disabled());
VM_BUG_ON(from == to); VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(pc->page));
nid = page_cgroup_nid(pc); nid = page_cgroup_nid(pc);
zid = page_cgroup_zid(pc); zid = page_cgroup_zid(pc);
from_mz = mem_cgroup_zoneinfo(from, nid, zid); from_mz = mem_cgroup_zoneinfo(from, nid, zid);
to_mz = mem_cgroup_zoneinfo(to, nid, zid); to_mz = mem_cgroup_zoneinfo(to, nid, zid);
if (!trylock_page_cgroup(pc)) if (!trylock_page_cgroup(pc))
return ret; return ret;
...@@ -681,18 +633,15 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, ...@@ -681,18 +633,15 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
if (pc->mem_cgroup != from) if (pc->mem_cgroup != from)
goto out; goto out;
if (spin_trylock(&to_mz->lru_lock)) { css_put(&from->css);
__mem_cgroup_remove_list(from_mz, pc); res_counter_uncharge(&from->res, PAGE_SIZE);
css_put(&from->css); mem_cgroup_charge_statistics(from, pc, false);
res_counter_uncharge(&from->res, PAGE_SIZE); if (do_swap_account)
if (do_swap_account) res_counter_uncharge(&from->memsw, PAGE_SIZE);
res_counter_uncharge(&from->memsw, PAGE_SIZE); pc->mem_cgroup = to;
pc->mem_cgroup = to; mem_cgroup_charge_statistics(to, pc, true);
css_get(&to->css); css_get(&to->css);
__mem_cgroup_add_list(to_mz, pc, false); ret = 0;
ret = 0;
spin_unlock(&to_mz->lru_lock);
}
out: out:
unlock_page_cgroup(pc); unlock_page_cgroup(pc);
return ret; return ret;
...@@ -706,39 +655,47 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, ...@@ -706,39 +655,47 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
struct mem_cgroup *child, struct mem_cgroup *child,
gfp_t gfp_mask) gfp_t gfp_mask)
{ {
struct page *page = pc->page;
struct cgroup *cg = child->css.cgroup; struct cgroup *cg = child->css.cgroup;
struct cgroup *pcg = cg->parent; struct cgroup *pcg = cg->parent;
struct mem_cgroup *parent; struct mem_cgroup *parent;
struct mem_cgroup_per_zone *mz;
unsigned long flags;
int ret; int ret;
/* Is ROOT ? */ /* Is ROOT ? */
if (!pcg) if (!pcg)
return -EINVAL; return -EINVAL;
parent = mem_cgroup_from_cont(pcg); parent = mem_cgroup_from_cont(pcg);
ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
if (ret) if (ret)
return ret; return ret;
mz = mem_cgroup_zoneinfo(child, if (!get_page_unless_zero(page))
page_cgroup_nid(pc), page_cgroup_zid(pc)); return -EBUSY;
ret = isolate_lru_page(page);
if (ret)
goto cancel;
spin_lock_irqsave(&mz->lru_lock, flags);
ret = mem_cgroup_move_account(pc, child, parent); ret = mem_cgroup_move_account(pc, child, parent);
spin_unlock_irqrestore(&mz->lru_lock, flags);
/* drop extra refcnt */ /* drop extra refcnt by try_charge() (move_account increment one) */
css_put(&parent->css); css_put(&parent->css);
/* uncharge if move fails */ putback_lru_page(page);
if (ret) { if (!ret) {
res_counter_uncharge(&parent->res, PAGE_SIZE); put_page(page);
if (do_swap_account) return 0;
res_counter_uncharge(&parent->memsw, PAGE_SIZE);
} }
/* uncharge if move fails */
cancel:
res_counter_uncharge(&parent->res, PAGE_SIZE);
if (do_swap_account)
res_counter_uncharge(&parent->memsw, PAGE_SIZE);
put_page(page);
return ret; return ret;
} }
...@@ -912,6 +869,8 @@ int mem_cgroup_cache_charge_swapin(struct page *page, ...@@ -912,6 +869,8 @@ int mem_cgroup_cache_charge_swapin(struct page *page,
} }
if (!locked) if (!locked)
unlock_page(page); unlock_page(page);
/* add this page(page_cgroup) to the LRU we want. */
mem_cgroup_lru_fixup(page);
return ret; return ret;
} }
...@@ -944,6 +903,8 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) ...@@ -944,6 +903,8 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
} }
} }
/* add this page(page_cgroup) to the LRU we want. */
mem_cgroup_lru_fixup(page);
} }
void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
...@@ -968,7 +929,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) ...@@ -968,7 +929,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
struct page_cgroup *pc; struct page_cgroup *pc;
struct mem_cgroup *mem = NULL; struct mem_cgroup *mem = NULL;
struct mem_cgroup_per_zone *mz; struct mem_cgroup_per_zone *mz;
unsigned long flags;
if (mem_cgroup_subsys.disabled) if (mem_cgroup_subsys.disabled)
return NULL; return NULL;
...@@ -1010,12 +970,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) ...@@ -1010,12 +970,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
res_counter_uncharge(&mem->memsw, PAGE_SIZE); res_counter_uncharge(&mem->memsw, PAGE_SIZE);
mem_cgroup_charge_statistics(mem, pc, false);
ClearPageCgroupUsed(pc); ClearPageCgroupUsed(pc);
mz = page_cgroup_zoneinfo(pc); mz = page_cgroup_zoneinfo(pc);
spin_lock_irqsave(&mz->lru_lock, flags);
__mem_cgroup_remove_list(mz, pc);
spin_unlock_irqrestore(&mz->lru_lock, flags);
unlock_page_cgroup(pc); unlock_page_cgroup(pc);
css_put(&mem->css); css_put(&mem->css);
...@@ -1281,21 +1239,22 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, ...@@ -1281,21 +1239,22 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
return ret; return ret;
} }
/* /*
* This routine traverse page_cgroup in given list and drop them all. * This routine traverse page_cgroup in given list and drop them all.
* *And* this routine doesn't reclaim page itself, just removes page_cgroup. * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
*/ */
static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
struct mem_cgroup_per_zone *mz, int node, int zid, enum lru_list lru)
enum lru_list lru)
{ {
struct zone *zone;
struct mem_cgroup_per_zone *mz;
struct page_cgroup *pc, *busy; struct page_cgroup *pc, *busy;
unsigned long flags; unsigned long flags, loop;
unsigned long loop;
struct list_head *list; struct list_head *list;
int ret = 0; int ret = 0;
zone = &NODE_DATA(node)->node_zones[zid];
mz = mem_cgroup_zoneinfo(mem, node, zid);
list = &mz->lists[lru]; list = &mz->lists[lru];
loop = MEM_CGROUP_ZSTAT(mz, lru); loop = MEM_CGROUP_ZSTAT(mz, lru);
...@@ -1304,19 +1263,19 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, ...@@ -1304,19 +1263,19 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
busy = NULL; busy = NULL;
while (loop--) { while (loop--) {
ret = 0; ret = 0;
spin_lock_irqsave(&mz->lru_lock, flags); spin_lock_irqsave(&zone->lru_lock, flags);
if (list_empty(list)) { if (list_empty(list)) {
spin_unlock_irqrestore(&mz->lru_lock, flags); spin_unlock_irqrestore(&zone->lru_lock, flags);
break; break;
} }
pc = list_entry(list->prev, struct page_cgroup, lru); pc = list_entry(list->prev, struct page_cgroup, lru);
if (busy == pc) { if (busy == pc) {
list_move(&pc->lru, list); list_move(&pc->lru, list);
busy = 0; busy = 0;
spin_unlock_irqrestore(&mz->lru_lock, flags); spin_unlock_irqrestore(&zone->lru_lock, flags);
continue; continue;
} }
spin_unlock_irqrestore(&mz->lru_lock, flags); spin_unlock_irqrestore(&zone->lru_lock, flags);
ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE); ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE);
if (ret == -ENOMEM) if (ret == -ENOMEM)
...@@ -1329,6 +1288,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, ...@@ -1329,6 +1288,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
} else } else
busy = NULL; busy = NULL;
} }
if (!ret && !list_empty(list)) if (!ret && !list_empty(list))
return -EBUSY; return -EBUSY;
return ret; return ret;
...@@ -1364,12 +1324,10 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) ...@@ -1364,12 +1324,10 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
ret = 0; ret = 0;
for_each_node_state(node, N_POSSIBLE) { for_each_node_state(node, N_POSSIBLE) {
for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
struct mem_cgroup_per_zone *mz;
enum lru_list l; enum lru_list l;
mz = mem_cgroup_zoneinfo(mem, node, zid);
for_each_lru(l) { for_each_lru(l) {
ret = mem_cgroup_force_empty_list(mem, ret = mem_cgroup_force_empty_list(mem,
mz, l); node, zid, l);
if (ret) if (ret)
break; break;
} }
...@@ -1413,6 +1371,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) ...@@ -1413,6 +1371,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
} }
} }
lru_add_drain();
/* try move_account...there may be some *locked* pages. */ /* try move_account...there may be some *locked* pages. */
if (mem->res.usage) if (mem->res.usage)
goto move_account; goto move_account;
...@@ -1657,7 +1616,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) ...@@ -1657,7 +1616,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
for (zone = 0; zone < MAX_NR_ZONES; zone++) { for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone]; mz = &pn->zoneinfo[zone];
spin_lock_init(&mz->lru_lock);
for_each_lru(l) for_each_lru(l)
INIT_LIST_HEAD(&mz->lists[l]); INIT_LIST_HEAD(&mz->lists[l]);
} }
...@@ -1706,8 +1664,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void) ...@@ -1706,8 +1664,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
static void mem_cgroup_free(struct mem_cgroup *mem) static void mem_cgroup_free(struct mem_cgroup *mem)
{ {
int node;
if (atomic_read(&mem->refcnt) > 0) if (atomic_read(&mem->refcnt) > 0)
return; return;
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);
if (mem_cgroup_size() < PAGE_SIZE) if (mem_cgroup_size() < PAGE_SIZE)
kfree(mem); kfree(mem);
else else
...@@ -1780,12 +1745,6 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, ...@@ -1780,12 +1745,6 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
static void mem_cgroup_destroy(struct cgroup_subsys *ss, static void mem_cgroup_destroy(struct cgroup_subsys *ss,
struct cgroup *cont) struct cgroup *cont)
{ {
int node;
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);
mem_cgroup_free(mem_cgroup_from_cont(cont)); mem_cgroup_free(mem_cgroup_from_cont(cont));
} }
......
...@@ -16,6 +16,7 @@ __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) ...@@ -16,6 +16,7 @@ __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
pc->flags = 0; pc->flags = 0;
pc->mem_cgroup = NULL; pc->mem_cgroup = NULL;
pc->page = pfn_to_page(pfn); pc->page = pfn_to_page(pfn);
INIT_LIST_HEAD(&pc->lru);
} }
static unsigned long total_usage; static unsigned long total_usage;
......
...@@ -168,7 +168,6 @@ void activate_page(struct page *page) ...@@ -168,7 +168,6 @@ void activate_page(struct page *page)
lru += LRU_ACTIVE; lru += LRU_ACTIVE;
add_page_to_lru_list(zone, page, lru); add_page_to_lru_list(zone, page, lru);
__count_vm_event(PGACTIVATE); __count_vm_event(PGACTIVATE);
mem_cgroup_move_lists(page, lru);
zone->recent_rotated[!!file]++; zone->recent_rotated[!!file]++;
zone->recent_scanned[!!file]++; zone->recent_scanned[!!file]++;
......
...@@ -512,7 +512,6 @@ void putback_lru_page(struct page *page) ...@@ -512,7 +512,6 @@ void putback_lru_page(struct page *page)
lru = LRU_UNEVICTABLE; lru = LRU_UNEVICTABLE;
add_page_to_unevictable_list(page); add_page_to_unevictable_list(page);
} }
mem_cgroup_move_lists(page, lru);
/* /*
* page's status can change while we move it among lru. If an evictable * page's status can change while we move it among lru. If an evictable
...@@ -547,7 +546,6 @@ void putback_lru_page(struct page *page) ...@@ -547,7 +546,6 @@ void putback_lru_page(struct page *page)
lru = !!TestClearPageActive(page) + page_is_file_cache(page); lru = !!TestClearPageActive(page) + page_is_file_cache(page);
lru_cache_add_lru(page, lru); lru_cache_add_lru(page, lru);
mem_cgroup_move_lists(page, lru);
put_page(page); put_page(page);
} }
#endif /* CONFIG_UNEVICTABLE_LRU */ #endif /* CONFIG_UNEVICTABLE_LRU */
...@@ -813,6 +811,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) ...@@ -813,6 +811,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
return ret; return ret;
ret = -EBUSY; ret = -EBUSY;
if (likely(get_page_unless_zero(page))) { if (likely(get_page_unless_zero(page))) {
/* /*
* Be careful not to clear PageLRU until after we're * Be careful not to clear PageLRU until after we're
...@@ -821,6 +820,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) ...@@ -821,6 +820,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
*/ */
ClearPageLRU(page); ClearPageLRU(page);
ret = 0; ret = 0;
mem_cgroup_del_lru(page);
} }
return ret; return ret;
...@@ -1134,7 +1134,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, ...@@ -1134,7 +1134,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
SetPageLRU(page); SetPageLRU(page);
lru = page_lru(page); lru = page_lru(page);
add_page_to_lru_list(zone, page, lru); add_page_to_lru_list(zone, page, lru);
mem_cgroup_move_lists(page, lru);
if (PageActive(page) && scan_global_lru(sc)) { if (PageActive(page) && scan_global_lru(sc)) {
int file = !!page_is_file_cache(page); int file = !!page_is_file_cache(page);
zone->recent_rotated[file]++; zone->recent_rotated[file]++;
...@@ -1263,7 +1262,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, ...@@ -1263,7 +1262,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
ClearPageActive(page); ClearPageActive(page);
list_move(&page->lru, &zone->lru[lru].list); list_move(&page->lru, &zone->lru[lru].list);
mem_cgroup_move_lists(page, lru); mem_cgroup_add_lru_list(page, lru);
pgmoved++; pgmoved++;
if (!pagevec_add(&pvec, page)) { if (!pagevec_add(&pvec, page)) {
__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
...@@ -2408,6 +2407,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone) ...@@ -2408,6 +2407,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone)
__dec_zone_state(zone, NR_UNEVICTABLE); __dec_zone_state(zone, NR_UNEVICTABLE);
list_move(&page->lru, &zone->lru[l].list); list_move(&page->lru, &zone->lru[l].list);
mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
__inc_zone_state(zone, NR_INACTIVE_ANON + l); __inc_zone_state(zone, NR_INACTIVE_ANON + l);
__count_vm_event(UNEVICTABLE_PGRESCUED); __count_vm_event(UNEVICTABLE_PGRESCUED);
} else { } else {
...@@ -2416,6 +2416,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone) ...@@ -2416,6 +2416,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone)
*/ */
SetPageUnevictable(page); SetPageUnevictable(page);
list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
if (page_evictable(page, NULL)) if (page_evictable(page, NULL))
goto retry; goto retry;
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册