diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 84f91faed4de335d2a784ce5472bbe1910c07239..9b99df80959050bf79558ac943c2d282326fd4d2 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -36,6 +36,9 @@ struct mem_cgroup;
 struct page;
 struct mm_struct;
 struct kmem_cache;
+struct oom_control;
+
+#define MEMCG_OOM_PRIORITY 12
 
 /* Cgroup-specific page state, on top of universal node page state */
 enum memcg_stat_item {
@@ -252,6 +255,12 @@ struct mem_cgroup {
 	bool		oom_lock;
 	int		under_oom;
 
+	/* memcg priority */
+	bool use_priority_oom;
+	int priority;
+	int num_oom_skip;
+	struct mem_cgroup *next_reset;
+
 	int	swappiness;
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
@@ -533,6 +542,21 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 	return !!(memcg->css.flags & CSS_ONLINE);
 }
 
+/* memcg priority */
+void mem_cgroup_account_oom_skip(struct task_struct *task,
+				 struct oom_control *oc);
+
+void mem_cgroup_select_bad_process(struct oom_control *oc);
+
+static inline bool root_memcg_use_priority_oom(void)
+{
+	if (mem_cgroup_disabled())
+		return false;
+	if (root_mem_cgroup->use_priority_oom)
+		return true;
+	return false;
+}
+
 /*
  * For memory reclaim.
  */
@@ -1035,6 +1059,21 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 	return true;
 }
 
+/* memcg priority */
+static inline void mem_cgroup_account_oom_skip(struct task_struct *task,
+					       struct oom_control *oc)
+{
+}
+
+static inline void mem_cgroup_select_bad_process(struct oom_control *oc)
+{
+}
+
+static inline bool root_memcg_use_priority_oom(void)
+{
+	return false;
+}
+
 static inline
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 69864a547663ecda818ac17c171ed085dbe33c26..53ab3299678809e6bbd86b2193094b2ce3896c5d 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -42,6 +42,11 @@ struct oom_control {
 	unsigned long totalpages;
 	struct task_struct *chosen;
 	unsigned long chosen_points;
+
+	/* Memcg priority */
+	struct mem_cgroup *reset_list;
+	int num_skip;
+	bool use_priority_oom;
 };
 
 extern struct mutex oom_lock;
@@ -111,6 +116,8 @@ extern int unregister_oom_notifier(struct notifier_block *nb);
 extern bool oom_killer_disable(signed long timeout);
 extern void oom_killer_enable(void);
 
+extern int oom_evaluate_task(struct task_struct *task, void *arg);
+
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
 /* sysctls */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a6ee0c17e466d6658cac7e392e582e3e9a96c05d..a9e51ec5e3e28ca62e41cbf6ac87fd223b5efbfd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1078,6 +1078,130 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
 					       dead_memcg);
 }
 
+/* memcg priority */
+/*
+ * mem_cgroup_account_oom_skip - account an OOM-unkillable task
+ * @task: task that is not OOM-killable
+ * @oc: oom_control struct
+ *
+ * Account an OOM-unkillable task to its own cgroup and to every ancestor up
+ * to the OOMing cgroup via @num_oom_skip. If all the tasks of one cgroup
+ * hierarchy are OOM-unkillable, that hierarchy is skipped when selecting
+ * the victim cgroup.
+ *
+ * The @num_oom_skip counts must be reset once bad process selection has
+ * finished, since these OOM-unkillable tasks may become killable before the
+ * next round of bad process selection.
+ */
+void mem_cgroup_account_oom_skip(struct task_struct *task,
+				 struct oom_control *oc)
+{
+	struct mem_cgroup *root, *memcg;
+	struct cgroup_subsys_state *css;
+
+	if (!oc->use_priority_oom)
+		return;
+	root = oc->memcg;
+	if (!root)
+		root = root_mem_cgroup;
+
+	memcg = mem_cgroup_from_task(task);
+	if (unlikely(!memcg))
+		return;
+	css = &memcg->css;
+	while (css) {
+		struct mem_cgroup *tmp;
+
+		tmp = mem_cgroup_from_css(css);
+		tmp->num_oom_skip++;
+		/*
+		 * Put these cgroups into a list to
+		 * reduce the iteration time when
+		 * resetting the @num_oom_skip counts.
+		 */
+		if (!tmp->next_reset) {
+			css_get(&tmp->css);
+			tmp->next_reset = oc->reset_list;
+			oc->reset_list = tmp;
+		}
+
+		if (mem_cgroup_from_css(css) == root)
+			break;
+
+		css = css->parent;
+	}
+}
+
+static struct mem_cgroup *
+mem_cgroup_select_victim_cgroup(struct mem_cgroup *memcg)
+{
+	struct cgroup_subsys_state *chosen, *parent;
+	struct cgroup_subsys_state *victim;
+	int chosen_priority;
+
+	if (!memcg->use_hierarchy) {
+		css_get(&memcg->css);
+		return memcg;
+	}
+again:
+	victim = NULL;
+	parent = &memcg->css;
+	rcu_read_lock();
+	while (parent) {
+		struct cgroup_subsys_state *pos;
+		struct mem_cgroup *parent_mem;
+
+		parent_mem = mem_cgroup_from_css(parent);
+
+		if (parent->nr_tasks <= parent_mem->num_oom_skip)
+			break;
+		victim = parent;
+		chosen = NULL;
+		chosen_priority = MEMCG_OOM_PRIORITY + 1;
+		list_for_each_entry_rcu(pos, &parent->children, sibling) {
+			struct mem_cgroup *tmp, *chosen_mem;
+
+			tmp = mem_cgroup_from_css(pos);
+
+			if (pos->nr_tasks <= tmp->num_oom_skip)
+				continue;
+			if (tmp->priority > chosen_priority)
+				continue;
+			if (tmp->priority < chosen_priority) {
+				chosen_priority = tmp->priority;
+				chosen = pos;
+				continue;
+			}
+
+			chosen_mem = mem_cgroup_from_css(chosen);
+
+			if (do_memsw_account()) {
+				if (page_counter_read(&tmp->memsw) >
+				    page_counter_read(&chosen_mem->memsw))
+					chosen = pos;
+			} else if (page_counter_read(&tmp->memory) >
+				   page_counter_read(&chosen_mem->memory)) {
+				chosen = pos;
+			}
+		}
+		parent = chosen;
+	}
+
+	if (likely(victim)) {
+		if (!css_tryget(victim)) {
+			rcu_read_unlock();
+			goto again;
+		}
+	}
+
+	rcu_read_unlock();
+
+	if (likely(victim))
+		return mem_cgroup_from_css(victim);
+
+	return NULL;
+}
+
 /**
  * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
  * @memcg: hierarchy root
@@ -1089,7 +1213,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
  * value, the function breaks the iteration loop and returns the value.
  * Otherwise, it will iterate over all tasks and return 0.
  *
- * This function must not be called for the root memory cgroup.
  */
 int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 			  int (*fn)(struct task_struct *, void *), void *arg)
@@ -1097,8 +1220,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 	struct mem_cgroup *iter;
 	int ret = 0;
 
-	BUG_ON(memcg == root_mem_cgroup);
-
 	for_each_mem_cgroup_tree(iter, memcg) {
 		struct css_task_iter it;
 		struct task_struct *task;
@@ -1115,6 +1236,44 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 	return ret;
 }
 
+void mem_cgroup_select_bad_process(struct oom_control *oc)
+{
+	struct mem_cgroup *memcg, *victim, *iter;
+
+	memcg = oc->memcg;
+
+	if (!memcg)
+		memcg = root_mem_cgroup;
+
+	oc->use_priority_oom = memcg->use_priority_oom;
+	victim = memcg;
+
+retry:
+	if (oc->use_priority_oom) {
+		victim = mem_cgroup_select_victim_cgroup(memcg);
+		if (!victim) {
+			if (mem_cgroup_is_root(memcg) && oc->num_skip)
+				oc->chosen = (void *)-1UL;
+			goto out;
+		}
+	}
+
+	mem_cgroup_scan_tasks(victim, oom_evaluate_task, oc);
+	if (oc->use_priority_oom) {
+		css_put(&victim->css);
+		if (!oc->chosen && victim != memcg)
+			goto retry;
+	}
+out:
+	/* See comments in mem_cgroup_account_oom_skip() */
+	while (oc->reset_list) {
+		iter = oc->reset_list;
+		iter->num_oom_skip = 0;
+		oc->reset_list = iter->next_reset;
+		iter->next_reset = NULL;
+		css_put(&iter->css);
+	}
+}
 
 /**
  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
  * @page: the page
@@ -3216,6 +3375,27 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
 	return retval;
 }
 
+static u64 mem_cgroup_priority_oom_read(struct cgroup_subsys_state *css,
+					struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	return memcg->use_priority_oom;
+}
+
+static int mem_cgroup_priority_oom_write(struct cgroup_subsys_state *css,
+					 struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	if (val > 1)
+		return -EINVAL;
+
+	memcg->use_priority_oom = val;
+
+	return 0;
+}
+
 struct accumulated_stats {
 	unsigned long stat[MEMCG_NR_STAT];
 	unsigned long events[NR_VM_EVENT_ITEMS];
@@ -4029,6 +4209,27 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
 	return 0;
 }
 
+static u64 mem_cgroup_priority_read(struct cgroup_subsys_state *css,
+				    struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	return memcg->priority;
+}
+
+static int mem_cgroup_priority_write(struct cgroup_subsys_state *css,
+				     struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	if (val > MEMCG_OOM_PRIORITY)
+		return -EINVAL;
+
+	memcg->priority = val;
+
+	return 0;
+}
+
 static int memory_wmark_ratio_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -5149,6 +5350,11 @@ static struct cftype mem_cgroup_legacy_files[] = {
 		.write_u64 = mem_cgroup_hierarchy_write,
 		.read_u64 = mem_cgroup_hierarchy_read,
 	},
+	{
+		.name = "use_priority_oom",
+		.write_u64 = mem_cgroup_priority_oom_write,
+		.read_u64 = mem_cgroup_priority_oom_read,
+	},
 	{
 		.name = "cgroup.event_control",	/* XXX: for compat */
 		.write = memcg_write_event_control,
@@ -5159,6 +5365,12 @@ static struct cftype mem_cgroup_legacy_files[] = {
 		.read_u64 = mem_cgroup_swappiness_read,
 		.write_u64 = mem_cgroup_swappiness_write,
 	},
+	{
+		.name = "priority",
+		.read_u64 = mem_cgroup_priority_read,
+		.write_u64 = mem_cgroup_priority_write,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
 	{
 		.name = "move_charge_at_immigrate",
 		.read_u64 = mem_cgroup_move_charge_read,
@@ -6639,6 +6851,17 @@ static struct cftype memory_files[] = {
 		.seq_show = memory_wmark_scale_factor_show,
 		.write = memory_wmark_scale_factor_write,
 	},
+	{
+		.name = "priority",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = mem_cgroup_priority_read,
+		.write_u64 = mem_cgroup_priority_write,
+	},
+	{
+		.name = "use_priority_oom",
+		.write_u64 = mem_cgroup_priority_oom_write,
+		.read_u64 = mem_cgroup_priority_oom_read,
+	},
 	{
 		.name = "events",
 		.flags = CFTYPE_NOT_ON_ROOT,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a581fe2a2f1fe2736022d7855ef8b90ba2c7fd3f..e1290082c822b947e0f48acd360419658caff2e9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -312,13 +312,15 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
 	return CONSTRAINT_NONE;
 }
 
-static int oom_evaluate_task(struct task_struct *task, void *arg)
+int oom_evaluate_task(struct task_struct *task, void *arg)
 {
 	struct oom_control *oc = arg;
 	unsigned long points;
 
-	if (oom_unkillable_task(task, NULL, oc->nodemask))
+	if (oom_unkillable_task(task, NULL, oc->nodemask)) {
+		mem_cgroup_account_oom_skip(task, oc);
 		goto next;
+	}
 
 	/*
 	 * This task already has access to memory reserves and is being killed.
@@ -327,8 +329,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	 * any memory is quite low.
 	 */
 	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
-		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
+		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) {
+			mem_cgroup_account_oom_skip(task, oc);
+			oc->num_skip++;
 			goto next;
+		}
 		goto abort;
 	}
 
@@ -342,7 +347,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	}
 
 	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
-	if (!points || points < oc->chosen_points)
+	if (!points) {
+		mem_cgroup_account_oom_skip(task, oc);
+		goto next;
+	}
+	if (points < oc->chosen_points)
 		goto next;
 
 	/* Prefer thread group leaders for display purposes */
@@ -369,8 +378,8 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
  */
 static void select_bad_process(struct oom_control *oc)
 {
-	if (is_memcg_oom(oc))
-		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
+	if (is_memcg_oom(oc) || root_memcg_use_priority_oom())
+		mem_cgroup_select_bad_process(oc);
 	else {
 		struct task_struct *p;
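
For reviewers, a minimal userspace sketch of how the two new knobs could be exercised once this patch is applied. The cgroup v1 mount point, the "app"/"batch"/"latency" group names, and the write_knob() helper are assumptions for illustration only; the facts taken from the patch are the control file names ("priority" and "use_priority_oom", exposed as memory.priority and memory.use_priority_oom), the 0..MEMCG_OOM_PRIORITY (12) range enforced by mem_cgroup_priority_write(), and the rule in mem_cgroup_select_victim_cgroup() that the sibling with the lowest priority value is preferred as the victim subtree.

/*
 * Hypothetical usage sketch (assumed paths and group names, not part of the
 * patch): enable priority-based OOM victim selection on the "app" subtree
 * and make "batch" a preferred victim relative to "latency".
 */
#include <stdio.h>
#include <stdlib.h>

/* Write a value to a cgroup control file, failing loudly on error. */
static void write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f || fputs(val, f) == EOF || fclose(f) == EOF) {
		perror(path);
		exit(EXIT_FAILURE);
	}
}

int main(void)
{
	/* Let OOMs in the "app" hierarchy pick a victim cgroup by priority. */
	write_knob("/sys/fs/cgroup/memory/app/memory.use_priority_oom", "1");

	/*
	 * Valid priorities are 0..MEMCG_OOM_PRIORITY (12). The selection loop
	 * picks the child with the lowest value, so "batch" is sacrificed
	 * before "latency".
	 */
	write_knob("/sys/fs/cgroup/memory/app/batch/memory.priority", "1");
	write_knob("/sys/fs/cgroup/memory/app/latency/memory.priority", "10");

	return 0;
}

When siblings share the same priority value, selection falls back to comparing memory (or memory+swap) usage via the page_counter_read() comparison in mem_cgroup_select_victim_cgroup(), so the larger consumer among the tied groups is chosen.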