Commit bb4cc1a8 authored by Andrew Morton, committed by Linus Torvalds

revert "memcg: get rid of soft-limit tree infrastructure"

Revert commit e883110a ("memcg: get rid of soft-limit tree
infrastructure")

I merged this prematurely - Michal and Johannes still disagree about the
overall design direction and the future remains unclear.

Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Parent b1aff7fc
@@ -39,6 +39,7 @@
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -124,6 +125,7 @@ static const char * const mem_cgroup_lru_names[] = {
  */
 enum mem_cgroup_events_target {
 	MEM_CGROUP_TARGET_THRESH,
+	MEM_CGROUP_TARGET_SOFTLIMIT,
 	MEM_CGROUP_TARGET_NUMAINFO,
 	MEM_CGROUP_NTARGETS,
 };
@@ -159,6 +161,10 @@ struct mem_cgroup_per_zone {
 	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+	struct rb_node		tree_node;	/* RB tree node */
+	unsigned long long	usage_in_excess;/* Set to the value by which */
+						/* the soft limit is exceeded */
+	bool			on_tree;
 	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
 						/* use container_of	   */
 };
@@ -167,6 +173,26 @@ struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
+
+/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+	struct rb_root rb_root;
+	spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
 struct mem_cgroup_threshold {
 	struct eventfd_ctx *eventfd;
 	u64 threshold;
@@ -405,6 +431,7 @@ static bool move_file(void)
  * limit reclaim to prevent infinite loops, if they ever occur.
  */
 #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
+#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -631,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 	return mem_cgroup_zoneinfo(memcg, nid, zid);
 }
 
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
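For context: both helpers are pure array indexing into the global soft_limit_tree forest declared above, so no locking is needed to locate a tree, only to touch its contents. A minimal user-space sketch of that composition (plain C; the small NR_NODES/NR_ZONES bounds and the toy nr_entries field are hypothetical stand-ins for MAX_NUMNODES, MAX_NR_ZONES, and the real rb_root plus spinlock):

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical small bounds standing in for MAX_NUMNODES/MAX_NR_ZONES. */
#define NR_NODES 2
#define NR_ZONES 3

struct tree_per_zone { int nr_entries; };	/* stands in for rb_root + spinlock */
struct tree_per_node { struct tree_per_zone zone[NR_ZONES]; };
struct tree { struct tree_per_node *node[NR_NODES]; };

static struct tree soft_limit_tree;

/* Mirrors soft_limit_tree_node_zone(): two array lookups, no search. */
static struct tree_per_zone *tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.node[nid]->zone[zid];
}

int main(void)
{
	for (int n = 0; n < NR_NODES; n++)
		soft_limit_tree.node[n] = calloc(1, sizeof(struct tree_per_node));
	tree_node_zone(1, 2)->nr_entries++;
	printf("node 1 zone 2 entries: %d\n", tree_node_zone(1, 2)->nr_entries);
	return 0;
}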
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz,
+				unsigned long long new_usage_in_excess)
+{
+	struct rb_node **p = &mctz->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup_per_zone *mz_node;
+
+	if (mz->on_tree)
+		return;
+
+	mz->usage_in_excess = new_usage_in_excess;
+	if (!mz->usage_in_excess)
+		return;
+	while (*p) {
+		parent = *p;
+		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+					tree_node);
+		if (mz->usage_in_excess < mz_node->usage_in_excess)
+			p = &(*p)->rb_left;
+		/*
+		 * We can't avoid mem cgroups that are over their soft
+		 * limit by the same amount
+		 */
+		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&mz->tree_node, parent, p);
+	rb_insert_color(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = true;
+}
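The ordering rule is worth noting: a strictly smaller excess goes left, greater-or-equal goes right, so multiple cgroups over their soft limit by the same amount can coexist in the tree, and rb_last() always yields the worst offender. A user-space sketch of the same rule, using a plain unbalanced BST instead of the kernel's self-balancing rbtree (node, insert, and last are illustrative names):

#include <stdio.h>
#include <stdlib.h>

/* Toy BST with the same tie-break as __mem_cgroup_insert_exceeded():
 * duplicates are allowed and always descend to the right. */
struct node {
	unsigned long long excess;
	struct node *left, *right;
};

static void insert(struct node **p, struct node *nz)
{
	while (*p) {
		if (nz->excess < (*p)->excess)
			p = &(*p)->left;
		else			/* >=: equal keys go right */
			p = &(*p)->right;
	}
	*p = nz;
}

/* Analogue of rb_last(): the rightmost node holds the largest excess. */
static struct node *last(struct node *root)
{
	while (root && root->right)
		root = root->right;
	return root;
}

int main(void)
{
	struct node *root = NULL;
	unsigned long long vals[] = { 40, 10, 40, 90 };
	for (int i = 0; i < 4; i++) {
		struct node *n = calloc(1, sizeof(*n));
		n->excess = vals[i];
		insert(&root, n);
	}
	printf("worst offender: %llu\n", last(root)->excess);	/* prints 90 */
	return 0;
}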
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	if (!mz->on_tree)
+		return;
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+	spin_unlock(&mctz->lock);
+}
+
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+{
+	unsigned long long excess;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+	mctz = soft_limit_tree_from_page(page);
+
+	/*
+	 * Necessary to update all ancestors when hierarchy is used,
+	 * because their event counters are not touched.
+	 */
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+		excess = res_counter_soft_limit_excess(&memcg->res);
+		/*
+		 * We have to update the tree if mz is on the RB-tree or
+		 * if the memcg is over its soft limit.
+		 */
+		if (excess || mz->on_tree) {
+			spin_lock(&mctz->lock);
+			/* if on-tree, remove it */
+			if (mz->on_tree)
+				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+			/*
+			 * Insert again. mz->usage_in_excess will be updated.
+			 * If excess is 0, no tree ops.
+			 */
+			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+			spin_unlock(&mctz->lock);
+		}
+	}
+}
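Note the loop shape: the walk starts at the charged memcg and climbs to the root via parent_mem_cgroup(), because only the charged group had its event counter bumped while every ancestor's usage (and hence excess) changed too. A toy sketch of that walk, with a simplified excess calculation standing in for res_counter_soft_limit_excess():

#include <stdio.h>

struct memcg { struct memcg *parent; long usage, soft_limit; };

static long excess(struct memcg *m)
{
	return m->usage > m->soft_limit ? m->usage - m->soft_limit : 0;
}

/* Ancestor walk: ancestors' event counters are not bumped by this page,
 * so their tree position must be refreshed here as well. */
static void update_tree(struct memcg *m)
{
	for (; m; m = m->parent)
		printf("re-key memcg with excess %ld\n", excess(m));
}

int main(void)
{
	struct memcg root = { NULL, 300, 100 };		/* 200 over its limit */
	struct memcg child = { &root, 150, 200 };	/* under its limit */
	update_tree(&child);	/* visits child (excess 0), then root (excess 200) */
	return 0;
}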
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+	int node, zone;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	for_each_node(node) {
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			mz = mem_cgroup_zoneinfo(memcg, node, zone);
+			mctz = soft_limit_tree_node_zone(node, zone);
+			mem_cgroup_remove_exceeded(memcg, mz, mctz);
+		}
+	}
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz;
+
+retry:
+	mz = NULL;
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back;
+	 * we will add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+	    !css_tryget(&mz->memcg->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock(&mctz->lock);
+	return mz;
+}
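The unlocked helper implements a pop-and-retry pattern: the rightmost (largest-excess) node is removed from the tree immediately, and stale candidates, whose excess dropped to zero or whose css reference could not be taken, are simply skipped. A toy sketch of that selection loop, with a sorted array and a css_alive flag standing in for the rbtree and css_tryget():

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

/* Toy victim list standing in for the RB-tree, sorted ascending by excess;
 * the last element plays the role of rb_last(). */
struct victim { long excess; bool css_alive; };
static struct victim tree[] = { {10, true}, {40, true}, {90, false} };
static int nr = 3;

/* Mirrors __mem_cgroup_largest_soft_limit_node(): pop the worst offender,
 * skipping entries whose excess vanished or whose css reference is gone. */
static struct victim *pick(void)
{
retry:
	if (nr == 0)
		return NULL;
	struct victim *v = &tree[--nr];	/* "remove the node now" */
	if (v->excess == 0 || !v->css_alive)
		goto retry;		/* stale entry: try the next-worst */
	return v;
}

int main(void)
{
	struct victim *v = pick();	/* 90 is skipped (css dead), 40 wins */
	printf("reclaim from victim with excess %ld\n", v ? v->excess : -1L);
	return 0;
}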
+
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
@@ -789,6 +974,9 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 	case MEM_CGROUP_TARGET_THRESH:
 		next = val + THRESHOLDS_EVENTS_TARGET;
 		break;
+	case MEM_CGROUP_TARGET_SOFTLIMIT:
+		next = val + SOFTLIMIT_EVENTS_TARGET;
+		break;
 	case MEM_CGROUP_TARGET_NUMAINFO:
 		next = val + NUMAINFO_EVENTS_TARGET;
 		break;
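Mechanically, each target keeps a "next" mark; once the running page-event counter passes it, the action fires and the mark advances by the target's interval, so soft-limit tree updates happen at most once per SOFTLIMIT_EVENTS_TARGET charges rather than on every page. A single-threaded sketch of that ratelimit (kernels of this era used 1024 for SOFTLIMIT_EVENTS_TARGET and tracked the counters per-cpu; both details are simplified away here):

#include <stdio.h>
#include <stdbool.h>

#define SOFTLIMIT_EVENTS_TARGET 1024

static unsigned long events, softlimit_next = SOFTLIMIT_EVENTS_TARGET;

/* Mirrors mem_cgroup_event_ratelimit(): fire at most once per interval;
 * the signed cast keeps the comparison correct across counter wraparound. */
static bool ratelimit(void)
{
	if ((long)(softlimit_next - events) < 0) {
		softlimit_next = events + SOFTLIMIT_EVENTS_TARGET;
		return true;
	}
	return false;
}

int main(void)
{
	int fired = 0;
	for (int i = 0; i < 5000; i++) {
		events++;			/* one page charged/uncharged */
		if (ratelimit())
			fired++;
	}
	printf("tree updates attempted: %d of 5000 events\n", fired);	/* 4 */
	return 0;
}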
@@ -811,8 +999,11 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 	/* threshold event is triggered in finer grain than soft limit */
 	if (unlikely(mem_cgroup_event_ratelimit(memcg,
 						MEM_CGROUP_TARGET_THRESH))) {
+		bool do_softlimit;
 		bool do_numainfo __maybe_unused;
 
+		do_softlimit = mem_cgroup_event_ratelimit(memcg,
+						MEM_CGROUP_TARGET_SOFTLIMIT);
 #if MAX_NUMNODES > 1
 		do_numainfo = mem_cgroup_event_ratelimit(memcg,
 						MEM_CGROUP_TARGET_NUMAINFO);
@@ -820,6 +1011,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 		preempt_enable();
 
 		mem_cgroup_threshold(memcg);
+		if (unlikely(do_softlimit))
+			mem_cgroup_update_tree(memcg, page);
 #if MAX_NUMNODES > 1
 		if (unlikely(do_numainfo))
 			atomic_inc(&memcg->numainfo_events);
@@ -1661,7 +1854,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 	return total;
 }
 
-#if MAX_NUMNODES > 1
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1684,6 +1876,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 	return false;
 }
+#if MAX_NUMNODES > 1
 
 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -1751,12 +1944,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 	return node;
 }
 
+/*
+ * Check all nodes whether they contain reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not contain
+ * enough new information. We need to do a double check.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_nodes.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(memcg->scan_nodes)) {
+		for (nid = first_node(memcg->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, memcg->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check the rest of the nodes.
+	 */
+	for_each_node_state(nid, N_MEMORY) {
+		if (node_isset(nid, memcg->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	return 0;
 }
+
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
+}
 #endif
 
 /*
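The two-pass structure above is the point: scan_nodes is a cached, lazily refreshed set of promising nodes, so a hit there answers quickly, but a miss cannot be trusted and the remaining nodes must be checked too. A user-space sketch of the same fast-path/slow-path split, with a bitmask and a hypothetical reclaimable() predicate standing in for nodemasks and test_mem_cgroup_node_reclaimable():

#include <stdio.h>
#include <stdbool.h>

#define MAX_NODES 8

/* Toy stand-ins: bit i of scan_nodes marks node i as "known interesting". */
static unsigned scan_nodes = 0x05;		/* nodes 0 and 2 cached */
static bool reclaimable(int nid) { return nid == 6; }	/* stale cache misses it */

/* Mirrors mem_cgroup_reclaimable(): fast pass over the cached set,
 * then a slow pass over everything else. */
static bool any_reclaimable(void)
{
	for (int nid = 0; nid < MAX_NODES; nid++)
		if (((scan_nodes >> nid) & 1) && reclaimable(nid))
			return true;
	for (int nid = 0; nid < MAX_NODES; nid++) {
		if ((scan_nodes >> nid) & 1)
			continue;	/* already checked in the fast pass */
		if (reclaimable(nid))
			return true;
	}
	return false;
}

int main(void)
{
	printf("reclaimable somewhere: %s\n", any_reclaimable() ? "yes" : "no");
	return 0;
}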
@@ -2692,7 +2924,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	unlock_page_cgroup(pc);
 
 	/*
-	 * "charge_statistics" updated event counter.
+	 * "charge_statistics" updated event counter. Then, check it.
+	 * Insert the ancestors (and the ancestors' ancestors) into the
+	 * softlimit RB-tree if they exceed their soft limit.
 	 */
 	memcg_check_events(memcg, page);
 }
@@ -5791,6 +6025,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
 		lruvec_init(&mz->lruvec);
+		mz->usage_in_excess = 0;
+		mz->on_tree = false;
 		mz->memcg = memcg;
 	}
 	memcg->nodeinfo[node] = pn;
@@ -5846,6 +6082,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	int node;
 	size_t size = memcg_size();
 
+	mem_cgroup_remove_from_trees(memcg);
 	free_css_id(&mem_cgroup_subsys, &memcg->css);
 
 	for_each_node(node)
@@ -5882,6 +6119,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
+
+static void __init mem_cgroup_soft_limit_tree_init(void)
+{
+	struct mem_cgroup_tree_per_node *rtpn;
+	struct mem_cgroup_tree_per_zone *rtpz;
+	int tmp, node, zone;
+
+	for_each_node(node) {
+		tmp = node;
+		if (!node_state(node, N_NORMAL_MEMORY))
+			tmp = -1;
+		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+		BUG_ON(!rtpn);
+
+		soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			rtpz = &rtpn->rb_tree_per_zone[zone];
+			rtpz->rb_root = RB_ROOT;
+			spin_lock_init(&rtpz->lock);
+		}
+	}
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
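One detail of the init path above: the per-node tree arrays are allocated up front with a node-local preference, and a node without normal memory falls back to nid = -1, meaning "no preference", while BUG_ON() guards against allocation failure at boot. A toy sketch of that placement fallback (calloc and the node_has_normal_memory() predicate are user-space stand-ins for kzalloc_node() and node_state()):

#include <stdio.h>
#include <stdlib.h>

#define NR_NODES 4
#define NR_ZONES 3

struct tree_per_zone { int root; };	/* stands in for rb_root + spinlock */
struct tree_per_node { struct tree_per_zone zone[NR_ZONES]; };

static struct tree_per_node *tree[NR_NODES];
static int node_has_normal_memory(int nid) { return nid != 2; }	/* toy: node 2 is memoryless */

/* Mirrors mem_cgroup_soft_limit_tree_init(): allocate each node's trees up
 * front, preferring node-local memory, falling back to "no preference". */
static void tree_init(void)
{
	for (int node = 0; node < NR_NODES; node++) {
		int pref = node_has_normal_memory(node) ? node : -1;
		printf("node %d: preferred allocation node %d\n", node, pref);
		tree[node] = calloc(1, sizeof(*tree[node]));
		if (!tree[node])
			abort();	/* the kernel BUG_ON()s here */
	}
}

int main(void)
{
	tree_init();
	return 0;
}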
@@ -6662,6 +6922,7 @@ static int __init mem_cgroup_init(void)
 {
 	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	enable_swap_cgroup();
+	mem_cgroup_soft_limit_tree_init();
 	memcg_stock_init();
 	return 0;
 }