Commit 52e375fc authored by Wenwei Tao, committed by Shile Zhang

alinux: mm: memcontrol: introduce memcg priority oom

Under memory pressure, both reclaim and OOM can occur. With multiple
cgroups in one system, we might want the memory or tasks of some of
them to survive reclaim and OOM while other candidates exist.

@memory.low and @memory.min already make that possible during reclaim;
this patch introduces memcg priority OOM to meet the same requirement
for OOM.

The priority ranges from 0 to 12; the higher the number, the higher
the priority. When an OOM occurs, the victim is always chosen from the
lowest-priority memcg: for example, with siblings A (priority 10) and
B (priority 1), the OOM killer evaluates tasks in B first. This works
for both memcg OOM and global OOM, and it can be enabled/disabled
through @memory.use_priority_oom (for global OOM, through the root
memcg's @memory.use_priority_oom). It is disabled by default.
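For illustration, a minimal userspace sketch of driving the new knobs.
This assumes the legacy memory controller is mounted at
/sys/fs/cgroup/memory and that the groups "latency" and "batch" already
exist; the paths and group names are illustrative, not part of the patch:

#include <stdio.h>
#include <stdlib.h>

/* Write a value into a cgroup control file, aborting on failure. */
static void write_str(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                exit(EXIT_FAILURE);
        }
        fprintf(f, "%s\n", val);
        fclose(f);
}

int main(void)
{
        /* Enable priority OOM for global OOM via the root memcg. */
        write_str("/sys/fs/cgroup/memory/memory.use_priority_oom", "1");

        /* 0..12; the higher the number, the less likely to be chosen. */
        write_str("/sys/fs/cgroup/memory/latency/memory.priority", "12");
        write_str("/sys/fs/cgroup/memory/batch/memory.priority", "1");
        return 0;
}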
Signed-off-by: Wenwei Tao <wenwei.tao@linux.alibaba.com>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Parent 1e91d392
include/linux/memcontrol.h
@@ -36,6 +36,9 @@ struct mem_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;
struct oom_control;

#define MEMCG_OOM_PRIORITY 12

/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
@@ -252,6 +255,12 @@ struct mem_cgroup {
        bool oom_lock;
        int under_oom;

        /* memcg priority */
        bool use_priority_oom;
        int priority;
        int num_oom_skip;
        struct mem_cgroup *next_reset;

        int swappiness;
        /* OOM-Killer disable */
        int oom_kill_disable;
@@ -533,6 +542,21 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
        return !!(memcg->css.flags & CSS_ONLINE);
}

/* memcg priority */
void mem_cgroup_account_oom_skip(struct task_struct *task,
                                 struct oom_control *oc);
void mem_cgroup_select_bad_process(struct oom_control *oc);

static inline bool root_memcg_use_priority_oom(void)
{
        if (mem_cgroup_disabled())
                return false;
        if (root_mem_cgroup->use_priority_oom)
                return true;
        return false;
}

/*
 * For memory reclaim.
 */
@@ -1035,6 +1059,21 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
        return true;
}

/* memcg priority */
static inline void mem_cgroup_account_oom_skip(struct task_struct *task,
                                               struct oom_control *oc)
{
}

static inline void mem_cgroup_select_bad_process(struct oom_control *oc)
{
}

static inline bool root_memcg_use_priority_oom(void)
{
        return false;
}

static inline unsigned long
mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
include/linux/oom.h
@@ -42,6 +42,11 @@ struct oom_control {
        unsigned long totalpages;
        struct task_struct *chosen;
        unsigned long chosen_points;

        /* Memcg priority */
        struct mem_cgroup *reset_list;
        int num_skip;
        bool use_priority_oom;
};
extern struct mutex oom_lock;
@@ -111,6 +116,8 @@ extern int unregister_oom_notifier(struct notifier_block *nb);
extern bool oom_killer_disable(signed long timeout);
extern void oom_killer_enable(void);
extern int oom_evaluate_task(struct task_struct *task, void *arg);
extern struct task_struct *find_lock_task_mm(struct task_struct *p);
/* sysctls */
mm/memcontrol.c
@@ -1078,6 +1078,130 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
                                                dead_memcg);
}
/* memcg priority */

/**
 * mem_cgroup_account_oom_skip - account the OOM-unkillable task
 * @task: non-OOM-killable task
 * @oc: oom_control struct
 *
 * Account an OOM-unkillable task to its cgroup and up to the OOMing
 * cgroup's @num_oom_skip; if all the tasks of one cgroup hierarchy are
 * OOM-unkillable, we skip that hierarchy when selecting the victim cgroup.
 *
 * The @num_oom_skip must be reset when bad-process selection has finished,
 * since before the next round of bad-process selection these
 * OOM-unkillable tasks might become killable.
 */
void mem_cgroup_account_oom_skip(struct task_struct *task,
                                 struct oom_control *oc)
{
        struct mem_cgroup *root, *memcg;
        struct cgroup_subsys_state *css;

        if (!oc->use_priority_oom)
                return;

        root = oc->memcg;
        if (!root)
                root = root_mem_cgroup;
        memcg = mem_cgroup_from_task(task);
        if (unlikely(!memcg))
                return;

        css = &memcg->css;
        while (css) {
                struct mem_cgroup *tmp;

                tmp = mem_cgroup_from_css(css);
                tmp->num_oom_skip++;
                /*
                 * Put these cgroups into a list to
                 * reduce the iteration time when resetting
                 * the @num_oom_skip.
                 */
                if (!tmp->next_reset) {
                        css_get(&tmp->css);
                        tmp->next_reset = oc->reset_list;
                        oc->reset_list = tmp;
                }

                if (mem_cgroup_from_css(css) == root)
                        break;
                css = css->parent;
        }
}
static struct mem_cgroup *
mem_cgroup_select_victim_cgroup(struct mem_cgroup *memcg)
{
        struct cgroup_subsys_state *chosen, *parent;
        struct cgroup_subsys_state *victim;
        int chosen_priority;

        if (!memcg->use_hierarchy) {
                css_get(&memcg->css);
                return memcg;
        }

again:
        victim = NULL;
        parent = &memcg->css;
        rcu_read_lock();
        while (parent) {
                struct cgroup_subsys_state *pos;
                struct mem_cgroup *parent_mem;

                parent_mem = mem_cgroup_from_css(parent);
                if (parent->nr_tasks <= parent_mem->num_oom_skip)
                        break;
                victim = parent;

                chosen = NULL;
                chosen_priority = DEF_PRIORITY + 1;
                list_for_each_entry_rcu(pos, &parent->children, sibling) {
                        struct mem_cgroup *tmp, *chosen_mem;

                        tmp = mem_cgroup_from_css(pos);
                        if (pos->nr_tasks <= tmp->num_oom_skip)
                                continue;
                        if (tmp->priority > chosen_priority)
                                continue;
                        if (tmp->priority < chosen_priority) {
                                chosen_priority = tmp->priority;
                                chosen = pos;
                                continue;
                        }

                        chosen_mem = mem_cgroup_from_css(chosen);
                        if (do_memsw_account()) {
                                if (page_counter_read(&tmp->memsw) >
                                    page_counter_read(&chosen_mem->memsw))
                                        chosen = pos;
                        } else if (page_counter_read(&tmp->memory) >
                                   page_counter_read(&chosen_mem->memory)) {
                                chosen = pos;
                        }
                }
                parent = chosen;
        }

        if (likely(victim)) {
                if (!css_tryget(victim)) {
                        rcu_read_unlock();
                        goto again;
                }
        }
        rcu_read_unlock();

        if (likely(victim))
                return mem_cgroup_from_css(victim);

        return NULL;
}
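To make the selection policy above concrete: at each level of the walk,
among children that still have killable tasks, the child with the lowest
priority value wins, and ties are broken toward the larger memory (or
memory+swap) consumer. A minimal userspace sketch of that per-level
choice (plain C; the struct and values are illustrative assumptions, not
kernel code):

#include <stdio.h>

/* Illustrative stand-in for a child memcg: not the kernel's struct. */
struct child {
        const char *name;
        int priority;           /* 0..12, lower value == preferred victim */
        unsigned long usage;    /* page counter, used as the tie-breaker  */
        int killable_tasks;     /* nr_tasks minus num_oom_skip            */
};

/* Pick the victim among siblings: lowest priority, then largest usage. */
static const struct child *pick_victim(const struct child *c, int n)
{
        const struct child *chosen = NULL;
        int chosen_priority = 12 + 1;   /* MEMCG_OOM_PRIORITY + 1 */
        int i;

        for (i = 0; i < n; i++) {
                if (c[i].killable_tasks <= 0)
                        continue;       /* every task here was skipped */
                if (c[i].priority > chosen_priority)
                        continue;
                /* Lower priority always wins; equal priority falls back
                 * to usage (chosen is non-NULL by then, since any
                 * priority beats the initial 13). */
                if (c[i].priority < chosen_priority ||
                    c[i].usage > chosen->usage) {
                        chosen_priority = c[i].priority;
                        chosen = &c[i];
                }
        }
        return chosen;
}

int main(void)
{
        const struct child siblings[] = {
                { "latency", 12, 800, 4 },
                { "batch",    1, 200, 2 },
                { "batch2",   1, 600, 3 },
        };
        const struct child *v = pick_victim(siblings, 3);

        /* Prints "batch2": priority 1 beats 12, larger usage wins the tie. */
        printf("victim level: %s\n", v ? v->name : "(none)");
        return 0;
}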
/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
@@ -1089,7 +1213,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
- * This function must not be called for the root memory cgroup.
 */
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
                          int (*fn)(struct task_struct *, void *), void *arg)
@@ -1097,8 +1220,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
        struct mem_cgroup *iter;
        int ret = 0;

-       BUG_ON(memcg == root_mem_cgroup);

        for_each_mem_cgroup_tree(iter, memcg) {
                struct css_task_iter it;
                struct task_struct *task;
@@ -1115,6 +1236,44 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
        return ret;
}
void mem_cgroup_select_bad_process(struct oom_control *oc)
{
        struct mem_cgroup *memcg, *victim, *iter;

        memcg = oc->memcg;
        if (!memcg)
                memcg = root_mem_cgroup;
        oc->use_priority_oom = memcg->use_priority_oom;
        victim = memcg;

retry:
        if (oc->use_priority_oom) {
                victim = mem_cgroup_select_victim_cgroup(memcg);
                if (!victim) {
                        if (mem_cgroup_is_root(memcg) && oc->num_skip)
                                oc->chosen = (void *)-1UL;
                        goto out;
                }
        }

        mem_cgroup_scan_tasks(victim, oom_evaluate_task, oc);
        if (oc->use_priority_oom) {
                css_put(&victim->css);
                if (!oc->chosen && victim != memcg)
                        goto retry;
        }

out:
        /* See comments in mem_cgroup_account_oom_skip() */
        while (oc->reset_list) {
                iter = oc->reset_list;
                iter->num_oom_skip = 0;
                oc->reset_list = iter->next_reset;
                iter->next_reset = NULL;
                css_put(&iter->css);
        }
}
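The drain loop above pairs with the push in mem_cgroup_account_oom_skip():
every cgroup whose @num_oom_skip gets bumped is queued once on an intrusive
single-linked list, so the reset touches only the cgroups dirtied during
this selection round rather than the whole tree. A standalone sketch of
the same pattern (plain C with illustrative names; it uses an explicit
queued flag where the kernel code tests next_reset):

#include <stdio.h>

/* Illustrative stand-in for a memcg with its per-OOM skip counter. */
struct node {
        const char *name;
        int num_skip;             /* dirtied while evaluating tasks     */
        int queued;               /* already on the reset list?         */
        struct node *next_reset;  /* intrusive single-linked reset list */
};

/* Bump the counter; enqueue the node only the first time it is dirtied. */
static void account_skip(struct node *n, struct node **reset_list)
{
        n->num_skip++;
        if (!n->queued) {
                n->queued = 1;
                n->next_reset = *reset_list;
                *reset_list = n;
        }
}

/* Drain the list: only nodes that were actually touched get reset. */
static void reset_all(struct node **reset_list)
{
        while (*reset_list) {
                struct node *iter = *reset_list;

                *reset_list = iter->next_reset;
                iter->num_skip = 0;
                iter->queued = 0;
                iter->next_reset = NULL;
        }
}

int main(void)
{
        struct node a = { "A" }, b = { "B" };
        struct node *reset_list = NULL;

        account_skip(&a, &reset_list);  /* A dirtied and queued          */
        account_skip(&a, &reset_list);  /* A dirtied again, not requeued */
        account_skip(&b, &reset_list);

        reset_all(&reset_list);
        printf("A=%d B=%d\n", a.num_skip, b.num_skip); /* prints A=0 B=0 */
        return 0;
}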
/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
@@ -3216,6 +3375,27 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
        return retval;
}
static u64 mem_cgroup_priority_oom_read(struct cgroup_subsys_state *css,
                                        struct cftype *cft)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);

        return memcg->use_priority_oom;
}

static int mem_cgroup_priority_oom_write(struct cgroup_subsys_state *css,
                                         struct cftype *cft, u64 val)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);

        if (val > 1)
                return -EINVAL;
        memcg->use_priority_oom = val;

        return 0;
}

struct accumulated_stats {
        unsigned long stat[MEMCG_NR_STAT];
        unsigned long events[NR_VM_EVENT_ITEMS];
@@ -4029,6 +4209,27 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
        return 0;
}
static u64 mem_cgroup_priority_read(struct cgroup_subsys_state *css,
                                    struct cftype *cft)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);

        return memcg->priority;
}

static int mem_cgroup_priority_write(struct cgroup_subsys_state *css,
                                     struct cftype *cft, u64 val)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);

        if (val > MEMCG_OOM_PRIORITY)
                return -EINVAL;
        memcg->priority = val;

        return 0;
}

static int memory_wmark_ratio_show(struct seq_file *m, void *v)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -5149,6 +5350,11 @@ static struct cftype mem_cgroup_legacy_files[] = {
                .write_u64 = mem_cgroup_hierarchy_write,
                .read_u64 = mem_cgroup_hierarchy_read,
        },
        {
                .name = "use_priority_oom",
                .write_u64 = mem_cgroup_priority_oom_write,
                .read_u64 = mem_cgroup_priority_oom_read,
        },
        {
                .name = "cgroup.event_control",         /* XXX: for compat */
                .write = memcg_write_event_control,
@@ -5159,6 +5365,12 @@ static struct cftype mem_cgroup_legacy_files[] = {
                .read_u64 = mem_cgroup_swappiness_read,
                .write_u64 = mem_cgroup_swappiness_write,
        },
        {
                .name = "priority",
                .read_u64 = mem_cgroup_priority_read,
                .write_u64 = mem_cgroup_priority_write,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                .name = "move_charge_at_immigrate",
                .read_u64 = mem_cgroup_move_charge_read,
@@ -6639,6 +6851,17 @@ static struct cftype memory_files[] = {
                .seq_show = memory_wmark_scale_factor_show,
                .write = memory_wmark_scale_factor_write,
        },
        {
                .name = "priority",
                .flags = CFTYPE_NOT_ON_ROOT,
                .read_u64 = mem_cgroup_priority_read,
                .write_u64 = mem_cgroup_priority_write,
        },
        {
                .name = "use_priority_oom",
                .write_u64 = mem_cgroup_priority_oom_write,
                .read_u64 = mem_cgroup_priority_oom_read,
        },
        {
                .name = "events",
                .flags = CFTYPE_NOT_ON_ROOT,
mm/oom_kill.c
@@ -312,13 +312,15 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
        return CONSTRAINT_NONE;
}
-static int oom_evaluate_task(struct task_struct *task, void *arg)
+int oom_evaluate_task(struct task_struct *task, void *arg)
{
        struct oom_control *oc = arg;
        unsigned long points;

-       if (oom_unkillable_task(task, NULL, oc->nodemask))
+       if (oom_unkillable_task(task, NULL, oc->nodemask)) {
+               mem_cgroup_account_oom_skip(task, oc);
                goto next;
+       }
        /*
         * This task already has access to memory reserves and is being killed.
@@ -327,8 +329,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
         * any memory is quite low.
         */
        if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
-               if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
+               if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) {
+                       mem_cgroup_account_oom_skip(task, oc);
+                       oc->num_skip++;
                        goto next;
+               }
                goto abort;
        }
@@ -342,7 +347,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
        }

        points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
-       if (!points || points < oc->chosen_points)
+       if (!points) {
+               mem_cgroup_account_oom_skip(task, oc);
+               goto next;
+       }
+       if (points < oc->chosen_points)
                goto next;

        /* Prefer thread group leaders for display purposes */
@@ -369,8 +378,8 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 */
static void select_bad_process(struct oom_control *oc)
{
-       if (is_memcg_oom(oc))
-               mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
+       if (is_memcg_oom(oc) || root_memcg_use_priority_oom())
+               mem_cgroup_select_bad_process(oc);
        else {
                struct task_struct *p;