Commit a3f9b995 authored by Jing Xiangfeng, committed by Yang Yingliang

memcg: support priority for oom

hulk inclusion
category: feature
bugzilla: 51827
CVE: NA

--------------------------------------

When an OOM occurs, we first try to kill a process from a low-priority
memcg. If no such process is found, we fall back to the normal handling.
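
As a minimal illustration (not part of this patch), a memcg could be marked
low priority from user space by writing to the memory.qos_level file added
below; the cgroup v1 mount point and the group name "low_prio" are
assumptions:

#include <stdio.h>

int main(void)
{
	/* Path assumes the legacy (v1) memory controller mounted at the usual place. */
	const char *path = "/sys/fs/cgroup/memory/low_prio/memory.qos_level";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* -1 marks the memcg as low priority; 0 (the default) clears it. */
	fprintf(f, "-1\n");
	fclose(f);
	return 0;
}

Tasks in such a memcg are then considered first when the OOM killer picks a
victim.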
Signed-off-by: Jing Xiangfeng <jingxiangfeng@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Parent ec6deb3a
@@ -287,6 +287,10 @@ struct mem_cgroup {
 	bool		tcpmem_active;
 	int		tcpmem_pressure;
 
+#ifdef CONFIG_MEMCG_QOS
+	/* Currently supports 0 and -1; other values may be added later */
+	int memcg_priority;
+#endif
 #ifdef CONFIG_MEMCG_KMEM
 	/* Index in the kmem_cache->memcg_params.memcg_caches array */
 	int kmemcg_id;
@@ -321,6 +325,12 @@ struct mem_cgroup_extension {
 	struct mem_cgroup memcg;
 };
 
+#ifdef CONFIG_MEMCG_QOS
+bool memcg_low_priority_scan_tasks(int (*)(struct task_struct *, void *),
+				   void *);
+void memcg_print_bad_task(void *arg, int ret);
+#endif
+
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
  * TODO: maybe necessary to use big numbers in big irons.
......
@@ -512,6 +512,18 @@ config USERSWAP
 	  Support for User Swap. This is based on userfaultfd. We can implement
 	  our own swapout and swapin functions in userspace.
 
+config MEMCG_QOS
+	bool "Enable Memory Cgroup Priority"
+	depends on MEMCG
+	depends on X86 || ARM64
+	default y
+	help
+	  MEMCG_QOS means that when an OOM occurs we first try to kill a
+	  process from a low-priority memcg. If no such process is found,
+	  we fall back to the normal handling.
+
+	  If unsure, say "n".
+
 config CMA
 	bool "Contiguous Memory Allocator"
 	depends on HAVE_MEMBLOCK && MMU
......
@@ -1105,6 +1105,9 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 			break;
 		}
 	}
+#ifdef CONFIG_MEMCG_QOS
+	memcg_print_bad_task(arg, ret);
+#endif
 	return ret;
 }
@@ -3400,6 +3403,118 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
 }
 #endif
 
+#ifdef CONFIG_MEMCG_QOS
+static void memcg_qos_init(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+
+	if (!parent)
+		return;
+
+	if (parent->memcg_priority && parent->use_hierarchy)
+		memcg->memcg_priority = parent->memcg_priority;
+}
+
+static s64 memcg_qos_read(struct cgroup_subsys_state *css,
+			  struct cftype *cft)
+{
+	return mem_cgroup_from_css(css)->memcg_priority;
+}
+
+static int memcg_qos_write(struct cgroup_subsys_state *css,
+			   struct cftype *cft, s64 val)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	if (val >= 0) {
+		memcg->memcg_priority = 0;
+	} else {
+		memcg->memcg_priority = -1;
+	}
+
+	return 0;
+}
+
+static struct mem_cgroup *memcg_find_max_usage(struct mem_cgroup *last)
+{
+	struct mem_cgroup *iter, *max_memcg = NULL;
+	struct cgroup_subsys_state *css;
+	unsigned long usage, max_usage = 0;
+
+	rcu_read_lock();
+	css_for_each_descendant_pre(css, &root_mem_cgroup->css) {
+		iter = mem_cgroup_from_css(css);
+
+		if (!iter->memcg_priority || iter == root_mem_cgroup || iter == last)
+			continue;
+
+		usage = mem_cgroup_usage(iter, false);
+		if (usage > max_usage) {
+			max_usage = usage;
+			max_memcg = iter;
+		}
+	}
+	rcu_read_unlock();
+
+	return max_memcg;
+}
+
+bool memcg_low_priority_scan_tasks(int (*fn)(struct task_struct *, void *),
+				   void *arg)
+{
+	struct mem_cgroup *max, *last = NULL;
+	struct oom_control *oc = arg;
+	struct css_task_iter it;
+	struct task_struct *task;
+	int ret = 0;
+	bool retry = true;
+
+retry:
+	max = memcg_find_max_usage(last);
+	if (!max)
+		return false;
+
+	css_task_iter_start(&max->css, CSS_TASK_ITER_PROCS, &it);
+	while (!ret && (task = css_task_iter_next(&it))) {
+		if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
+			pr_info("task %s is dying.\n", task->comm);
+			continue;
+		}
+
+		ret = fn(task, arg);
+	}
+	css_task_iter_end(&it);
+
+	if (ret)
+		return false;
+
+	if (!oc->chosen && retry) {
+		last = max;
+		retry = false;
+		goto retry;
+	}
+
+	if (oc->chosen)
+		pr_info("The bad task [%d:%s] is from low-priority memcg.\n",
+			oc->chosen->pid, oc->chosen->comm);
+
+	return oc->chosen ? true : false;
+}
+
+void memcg_print_bad_task(void *arg, int ret)
+{
+	struct oom_control *oc = arg;
+
+	if (!ret && oc->chosen) {
+		struct mem_cgroup *memcg;
+
+		memcg = mem_cgroup_from_task(oc->chosen);
+		if (memcg->memcg_priority)
+			pr_info("The bad task [%d:%s] is from low-priority memcg.\n",
+				oc->chosen->pid, oc->chosen->comm);
+	}
+}
+#endif
+
 #ifdef CONFIG_NUMA
 static int memcg_numa_stat_show(struct seq_file *m, void *v)
 {
@@ -4324,6 +4439,13 @@ static struct cftype mem_cgroup_legacy_files[] = {
 	{
 		.name = "pressure_level",
 	},
+#ifdef CONFIG_MEMCG_QOS
+	{
+		.name = "qos_level",
+		.read_s64 = memcg_qos_read,
+		.write_s64 = memcg_qos_write,
+	},
+#endif
 #ifdef CONFIG_NUMA
 	{
 		.name = "numa_stat",
@@ -4657,6 +4779,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 		return -ENOMEM;
 	}
 
+#ifdef CONFIG_MEMCG_QOS
+	memcg_qos_init(memcg);
+#endif
+
 	/* Online state pins memcg ID, memcg ID pins CSS */
 	atomic_set(&memcg->id.ref, 1);
 	css_get(css);
......
@@ -317,6 +317,49 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
 	return CONSTRAINT_NONE;
 }
 
+#ifdef CONFIG_MEMCG_QOS
+/*
+ * Prefer a task from a low-priority memcg. Among tasks of the same
+ * priority, prefer the one with the highest badness 'points'.
+ */
+static bool oom_next_task(struct task_struct *task, struct oom_control *oc,
+		unsigned long points)
+{
+	struct mem_cgroup *cur_memcg;
+	struct mem_cgroup *oc_memcg;
+
+	if (!points)
+		return true;
+
+	if (!oc->chosen)
+		return false;
+
+	oc_memcg = mem_cgroup_from_task(oc->chosen);
+	cur_memcg = mem_cgroup_from_task(task);
+
+	if (cur_memcg->memcg_priority == oc_memcg->memcg_priority) {
+		if (points < oc->chosen_points)
+			return true;
+		return false;
+	}
+	/* the chosen task is already from a low-priority memcg, so skip this one */
+	if (oc_memcg->memcg_priority)
+		return true;
+
+	return false;
+}
+#else
+static inline bool oom_next_task(struct task_struct *task, struct oom_control *oc,
+		unsigned long points)
+{
+	if (!points || points < oc->chosen_points)
+		return true;
+
+	return false;
+}
+#endif
+
 static int oom_evaluate_task(struct task_struct *task, void *arg)
 {
 	struct oom_control *oc = arg;
@@ -347,7 +390,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	}
 
 	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
-	if (!points || points < oc->chosen_points)
+	if (oom_next_task(task, oc, points))
 		goto next;
 
 select:
@@ -376,6 +419,12 @@ static void select_bad_process(struct oom_control *oc)
 	else {
 		struct task_struct *p;
 
+#ifdef CONFIG_MEMCG_QOS
+		if (memcg_low_priority_scan_tasks(oom_evaluate_task, oc)) {
+			oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
+			return;
+		}
+#endif
 		rcu_read_lock();
 		for_each_process(p)
 			if (oom_evaluate_task(p, oc))
......