diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 719eba285430938c0daa6f447847517ce20935e1..cbe792ff7f7c5215494891d745c00be115aed583 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -422,6 +422,7 @@ static inline void cgroup_put(struct cgroup *cgrp)
         css_put(&cgrp->self);
 }
 
+extern struct mutex cgroup_mutex;
 /**
  * task_css_set_check - obtain a task's css_set with extra access conditions
  * @task: the task to obtain css_set for
@@ -436,7 +437,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
  * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
 extern spinlock_t css_set_lock;
 #define task_css_set_check(task, __c)          \
         rcu_dereference_check((task)->cgroups, \
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8011b8ea459a3ae3a09dea454db148116069940e..a8e2e9d6b61b5cd7fcbe5194a26320d4c0aaeeae 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -72,6 +72,8 @@ struct mem_cgroup_reclaim_cookie {
         unsigned int generation;
 };
 
+struct alloc_context;
+
 #ifdef CONFIG_MEMCG
 
 #define MEM_CGROUP_ID_SHIFT    16
@@ -296,6 +298,9 @@ struct mem_cgroup {
         bool                    tcpmem_active;
         int                     tcpmem_pressure;
 
+        int wmark_min_adj;      /* user-set value */
+        int wmark_min_eadj;     /* value in effect */
+
         unsigned int wmark_ratio;
         struct work_struct wmark_work;
         unsigned int wmark_scale_factor;
@@ -550,6 +555,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
 }
 
 void mem_cgroup_handle_over_high(void);
+void mem_cgroup_wmark_min_throttle(void);
 
 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
 
@@ -858,6 +864,9 @@ static inline bool is_wmark_ok(struct mem_cgroup *memcg, bool high)
         return page_counter_read(&memcg->memory) < memcg->memory.wmark_low;
 }
 
+int memcg_get_wmark_min_adj(struct task_struct *curr);
+void memcg_check_wmark_min_adj(struct task_struct *curr,
+                               struct alloc_context *ac);
 #else /* CONFIG_MEMCG */
 
 #define MEM_CGROUP_ID_SHIFT    0
@@ -1059,6 +1068,10 @@ static inline void mem_cgroup_handle_over_high(void)
 {
 }
 
+static inline void mem_cgroup_wmark_min_throttle(void)
+{
+}
+
 static inline void mem_cgroup_enter_user_fault(void)
 {
 }
@@ -1179,6 +1192,16 @@ static inline bool is_wmark_ok(struct mem_cgroup *memcg, bool low)
 {
         return false;
 }
+
+static inline int memcg_get_wmark_min_adj(struct task_struct *curr)
+{
+        return 0;
+}
+
+static inline void memcg_check_wmark_min_adj(struct task_struct *curr,
+                                             struct alloc_context *ac)
+{
+}
 #endif /* CONFIG_MEMCG */
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4b4a90b32d837876ea0e250cb1fdbe078fdd4751..8ace05026bfb515f0ed4d90f819ab841d8ad031d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1179,6 +1179,7 @@ struct task_struct {
 
         /* Number of pages to reclaim on returning to userland: */
         unsigned int                    memcg_nr_pages_over_high;
+        unsigned int                    wmark_min_throttle_ms;
 
         /* Used by memcontrol for targeted memcg charge: */
         struct mem_cgroup               *active_memcg;
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 05589a3e37f47992f7390d56e84606ef1b3f8888..36f7b9885a000b67832e395dd12734042b58ddce 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -193,6 +193,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
                 task_work_run();
 
         mem_cgroup_handle_over_high();
+        mem_cgroup_wmark_min_throttle();
         blkcg_maybe_throttle_current();
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1863abb714aec245b2c44aba4b141f3ef28b7fdd..64a6d7449cee50fc72a1f9e50204035f0dbef33d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -56,6 +57,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -65,7 +67,6 @@
 #include
 #include
 #include
-#include
 #include "internal.h"
 #include
 #include
@@ -4086,6 +4087,168 @@ static ssize_t memory_wmark_scale_factor_write(struct kernfs_open_file *of,
         return nbytes;
 }
 
+/*
+ * Figure out the maximal (most conservative) @wmark_min_adj along
+ * the hierarchy, excluding intermediate default-zero values, and use
+ * it as the effective one. Example:
+ *
+ *              root
+ *             /    \
+ *            A      D
+ *           / \
+ *          B   C
+ *         / \
+ *        E   F
+ *
+ * wmark_min_adj:  A -10, B -25, C 0, D 50, E -25, F 50
+ * wmark_min_eadj: A -10, B -10, C 0, D 50, E -10, F 50
+ */
+static void memcg_update_wmark_min_adj(struct mem_cgroup *memcg, int val)
+{
+        struct mem_cgroup *p;
+        struct mem_cgroup *iter;
+
+        mutex_lock(&cgroup_mutex);
+        memcg->wmark_min_adj = val;
+        /* update hierarchical wmark_min_eadj, pre-order iteration */
+        for_each_mem_cgroup_tree(iter, memcg) {
+                if (!mem_cgroup_online(iter))
+                        continue;
+                val = iter->wmark_min_adj;
+                p = parent_mem_cgroup(iter);
+                if (p && p->wmark_min_eadj && p->wmark_min_eadj > val)
+                        val = p->wmark_min_eadj;
+                iter->wmark_min_eadj = val;
+        }
+        mutex_unlock(&cgroup_mutex);
+}
+
+static int memory_wmark_min_adj_show(struct seq_file *m, void *v)
+{
+        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+        /* show the final effective value */
+        seq_printf(m, "%d\n", memcg->wmark_min_eadj);
+
+        return 0;
+}
+
+static ssize_t memory_wmark_min_adj_write(struct kernfs_open_file *of,
+                                char *buf, size_t nbytes, loff_t off)
+{
+        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+        int ret, wmark_min_adj;
+
+        buf = strstrip(buf);
+        ret = kstrtoint(buf, 0, &wmark_min_adj);
+        if (ret)
+                return ret;
+
+        if (wmark_min_adj < -25 || wmark_min_adj > 50)
+                return -EINVAL;
+
+        memcg_update_wmark_min_adj(memcg, wmark_min_adj);
+
+        return nbytes;
+}
+
+int memcg_get_wmark_min_adj(struct task_struct *curr)
+{
+        struct mem_cgroup *memcg;
+        int val;
+
+        if (mem_cgroup_disabled())
+                return 0;
+
+        rcu_read_lock();
+        memcg = mem_cgroup_from_css(task_css(curr, memory_cgrp_id));
+        if (mem_cgroup_is_root(memcg))
+                val = 0;
+        else
+                val = memcg->wmark_min_eadj;
+        rcu_read_unlock();
+
+        return val;
+}
+
+/*
+ * Scheduled by the global page allocator and executed from the userland
+ * return path: throttle when free pages are below the memcg's effective
+ * min watermark.
+ */
+void mem_cgroup_wmark_min_throttle(void)
+{
+        unsigned int msec = current->wmark_min_throttle_ms;
+        unsigned long pflags;
+
+        if (likely(!msec))
+                return;
+        psi_memstall_enter(&pflags);
+        msleep_interruptible(msec);
+        psi_memstall_leave(&pflags);
+        current->wmark_min_throttle_ms = 0;
+}
+
+#define WMARK_MIN_THROTTLE_MS   100UL
+/*
+ * Tasks in a memcg with a positive memory.wmark_min_adj have their own
+ * global min watermark, higher than the global WMARK_MIN:
+ * "WMARK_MIN + (WMARK_LOW - WMARK_MIN) * memory.wmark_min_adj / 100"
+ *
+ * A positive memory.wmark_min_adj means low QoS requirements. When an
+ * allocation breaks the memcg min watermark, it would traditionally
+ * trigger direct reclaim; throttle instead to further prevent such
+ * tasks from disturbing others.
+ *
+ * The throttle time is simply linearly proportional to the pages
+ * consumed below the memcg's min watermark.
+ *
+ * The base throttle time is WMARK_MIN_THROTTLE_MS, and the maximal
+ * throttle time is ten times WMARK_MIN_THROTTLE_MS.
+ *
+ * The actual throttling is executed from the userland return path,
+ * see mem_cgroup_wmark_min_throttle().
+ */
+void memcg_check_wmark_min_adj(struct task_struct *curr,
+                               struct alloc_context *ac)
+{
+        struct zoneref *z;
+        struct zone *zone;
+        unsigned long wmark_min, wmark, min_low_gap, free_pages;
+        int wmark_min_adj = memcg_get_wmark_min_adj(curr);
+
+        if (wmark_min_adj <= 0)
+                return;
+
+        if (curr->wmark_min_throttle_ms)
+                return;
+
+        z = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask);
+        for_next_zone_zonelist_nodemask(zone, z, ac->zonelist,
+                                        ac->high_zoneidx, ac->nodemask) {
+                if (cpusets_enabled() &&
+                    !__cpuset_zone_allowed(zone, __GFP_HARDWALL))
+                        continue;
+
+                wmark_min = min_wmark_pages(zone);
+                min_low_gap = low_wmark_pages(zone) - wmark_min;
+                free_pages = zone_page_state(zone, NR_FREE_PAGES);
+                wmark = wmark_min + min_low_gap * wmark_min_adj / 100;
+                if (free_pages < wmark && wmark > wmark_min) {
+                        unsigned long msec;
+
+                        /*
+                         * The throttle time is simply linearly proportional
+                         * to the pages consumed below the memcg's min
+                         * watermark.
+                         */
+                        msec = (wmark - free_pages) * WMARK_MIN_THROTTLE_MS /
+                                (wmark - wmark_min);
+                        msec = clamp(msec, 1UL, 10 * WMARK_MIN_THROTTLE_MS);
+                        curr->wmark_min_throttle_ms = msec;
+                        set_notify_resume(curr);
+                        break;
+                }
+        }
+}
+
 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
 {
         struct mem_cgroup_threshold_ary *t;
@@ -4945,6 +5108,12 @@ static struct cftype mem_cgroup_legacy_files[] = {
                 .seq_show = memory_wmark_scale_factor_show,
                 .write = memory_wmark_scale_factor_write,
         },
+        {
+                .name = "wmark_min_adj",
+                .flags = CFTYPE_NOT_ON_ROOT,
+                .seq_show = memory_wmark_min_adj_show,
+                .write = memory_wmark_min_adj_write,
+        },
         {
                 .name = "force_empty",
                 .write = mem_cgroup_force_empty_write,
@@ -5319,6 +5488,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 
         setup_memcg_wmark(memcg);
 
+        if (parent) {
+                memcg->wmark_min_adj = parent->wmark_min_adj;
+                memcg->wmark_min_eadj = parent->wmark_min_eadj;
+        }
+
         /* The following stuff does not apply to the root */
         if (!parent) {
                 root_mem_cgroup = memcg;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1db2b4dcea237dfe4947b84702899e7eb2d50fee..c346ae3c88e0ec88727868c29c76b643b187ac1a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3206,6 +3206,14 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
         int o;
         const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
 
+        /* apply negative memory.wmark_min_adj */
+        if ((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) {
+                int min_adj = memcg_get_wmark_min_adj(current);
+
+                if (min_adj < 0)
+                        min -= mark * (-min_adj) / 100;
+        }
+
         /* free_pages may go negative - that's OK */
         free_pages -= (1 << order) - 1;
 
@@ -3232,6 +3240,12 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                         min -= min / 4;
         }
 
+        /*
+         * min can only drop below mark/4 due to a negative
+         * memory.wmark_min_adj; guarantee a safe floor after the adjustment.
+         */
+        if (min < mark / 4)
+                min = mark / 4;
 
 #ifdef CONFIG_CMA
         /* If allocation can't use CMA areas don't use free CMA pages */
@@ -4387,6 +4401,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         warn_alloc(gfp_mask, ac->nodemask,
                         "page allocation failure: order:%u", order);
 got_pg:
+
+        if (ac->migratetype == MIGRATE_MOVABLE)
+                memcg_check_wmark_min_adj(current, ac);
+
         return page;
 }
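For reference, the throttle-time arithmetic introduced above can be illustrated with a small standalone sketch. This is not part of the patch: it is a userspace program using made-up example numbers (wmark_min, wmark_low, free_pages, wmark_min_adj) and a hand-rolled clamp_ul() helper standing in for the kernel's clamp(), showing how memcg_check_wmark_min_adj() derives the per-task throttle duration that mem_cgroup_wmark_min_throttle() later sleeps for.

#include <stdio.h>

#define WMARK_MIN_THROTTLE_MS 100UL

/* Userspace stand-in for the kernel's clamp() used in the patch. */
static unsigned long clamp_ul(unsigned long val, unsigned long lo,
                              unsigned long hi)
{
        if (val < lo)
                return lo;
        if (val > hi)
                return hi;
        return val;
}

int main(void)
{
        /* Hypothetical zone state, in pages. */
        unsigned long wmark_min = 1000;         /* zone WMARK_MIN */
        unsigned long wmark_low = 2000;         /* zone WMARK_LOW */
        unsigned long free_pages = 1250;        /* current NR_FREE_PAGES */
        int wmark_min_adj = 50;                 /* memcg's wmark_min_eadj */

        /*
         * Effective per-memcg min watermark, as in the patch comment:
         * WMARK_MIN + (WMARK_LOW - WMARK_MIN) * wmark_min_adj / 100
         */
        unsigned long min_low_gap = wmark_low - wmark_min;
        unsigned long wmark = wmark_min + min_low_gap * wmark_min_adj / 100;

        if (free_pages < wmark && wmark > wmark_min) {
                /*
                 * Throttle time is linearly proportional to the deficit
                 * below the effective watermark, clamped to
                 * [1, 10 * WMARK_MIN_THROTTLE_MS] milliseconds.
                 */
                unsigned long msec = (wmark - free_pages) *
                        WMARK_MIN_THROTTLE_MS / (wmark - wmark_min);

                msec = clamp_ul(msec, 1UL, 10 * WMARK_MIN_THROTTLE_MS);
                printf("effective wmark %lu pages -> throttle %lu ms\n",
                       wmark, msec);
        } else {
                printf("free pages above effective wmark, no throttle\n");
        }
        return 0;
}

With these numbers the effective watermark is 1500 pages; a 250-page deficit out of the 500-page gap between the effective watermark and the zone's real WMARK_MIN yields a 50 ms sleep. The sleep reaches the 100 ms base when free pages fall to WMARK_MIN and is capped at ten times that as they drop further. The knob itself is the memory.wmark_min_adj file added to mem_cgroup_legacy_files (cgroup v1 memory controller, non-root groups only), accepting values in the range [-25, 50].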