Commit 60be0f54 authored by Xunlei Pang, committed by Shile Zhang

alinux: memcg: Introduce memory.wmark_min_adj

In co-location environments there is more or less memory
overcommitment, so BATCH tasks may break the shared global min
watermark, causing all types of applications to fall into the
direct reclaim slow path and hurting the RT of LS tasks.
(NOTE: BATCH tasks tolerate big latency spikes, even in seconds,
as long as their overall throughput is not hurt. LS tasks, by
contrast, are very Latency-Sensitive and may time out or fail
when a sudden latency spike lasts for, typically, hundreds of ms.)

Actually, BATCH tasks are not sensitive to memory allocation
latency, so they can be assigned a strict min watermark different
from that of LS tasks (which can accordingly be assigned a lenient
min watermark), isolating the two from each other during global
memory allocation. This is similar to the idea behind ALLOC_HARDER
for rt_task(), see gfp_to_alloc_flags().

memory.wmark_min_adj stands for a per-memcg adjustment of the global
WMARK_MIN; it is used to realize the separate min watermarks described
above. Its valid range is [-25, 50]. Specifically:
a negative value is relative to the interval [0, WMARK_MIN],
a positive value is relative to the interval [WMARK_MIN, WMARK_LOW].
For example,
  -25 means "WMARK_MIN + (WMARK_MIN - 0) * (-25%)"
   50 means "WMARK_MIN + (WMARK_LOW - WMARK_MIN) * 50%"
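
As a quick illustration, here is a minimal sketch (not part of this
patch; the helper name and standalone form are made up) of how the two
formulas translate into an adjusted per-memcg min watermark:

        /*
         * Illustrative only: derive the adjusted min watermark from
         * wmark_min_adj according to the two formulas above.
         */
        static unsigned long adjusted_wmark_min(unsigned long wmark_min,
                                                unsigned long wmark_low,
                                                int wmark_min_adj)
        {
                if (wmark_min_adj < 0)
                        /* e.g. -25: WMARK_MIN - 25% of WMARK_MIN */
                        return wmark_min -
                               wmark_min * (-wmark_min_adj) / 100;

                /* e.g. 50: WMARK_MIN + 50% of (WMARK_LOW - WMARK_MIN) */
                return wmark_min +
                       (wmark_low - wmark_min) * wmark_min_adj / 100;
        }

E.g. with WMARK_MIN = 4000 pages and WMARK_LOW = 5000 pages, an
adjustment of -25 yields 3000 pages and 50 yields 4500 pages.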

Note that the minimum of -25 matches what ALLOC_HARDER uses, so it is
safe for us to adopt, while the maximum of 50 is an empirically chosen
value.

A negative memory.wmark_min_adj means high QoS requirements: tasks in
such a memcg may allocate below the global WMARK_MIN, which is again
similar to the idea behind ALLOC_HARDER, see gfp_to_alloc_flags().

A positive memory.wmark_min_adj means low QoS requirements: when an
allocation breaks the memcg min watermark, which would traditionally
trigger direct reclaim, we trigger throttling instead to further
prevent such tasks from disturbing others.
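
To make the scaling concrete (the numbers are made up for
illustration): with the base throttle time of 100ms, a zone whose
adjusted min watermark sits 1000 pages above WMARK_MIN, and free pages
currently 500 pages below that adjusted watermark, the task is
throttled for 500 * 100 / 1000 = 50ms, clamped to [1ms, 1000ms].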

With this interface, we can assign positive values for BATCH memcgs
and negative values for LS memcgs.
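
For example, assuming a cgroup v1 memory controller mounted at
/sys/fs/cgroup/memory (mount point and group names are illustrative):

  echo 50 > /sys/fs/cgroup/memory/batch/memory.wmark_min_adj
  echo -25 > /sys/fs/cgroup/memory/ls/memory.wmark_min_adj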

memory.wmark_min_adj defaults to 0 and is inherited from the parent.
Note that the final effective wmark_min_adj takes all the hierarchical
values into account: it is the maximal (most conservative)
wmark_min_adj along the hierarchy, excluding intermediate default
values (zero).
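
For instance, in the example hierarchy shown in the comment of
memcg_update_wmark_min_adj() below, B (own value -25) under A (own
value -10) gets the effective value -10, while C (left at the
default 0) keeps the effective value 0.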
Reviewed-by: Yang Shi <yang.shi@linux.alibaba.com>
Reviewed-by: Gavin Shan <shan.gavin@linux.alibaba.com>
Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
Parent 63442ea9
@@ -422,6 +422,7 @@ static inline void cgroup_put(struct cgroup *cgrp)
        css_put(&cgrp->self);
}

extern struct mutex cgroup_mutex;
/**
 * task_css_set_check - obtain a task's css_set with extra access conditions
 * @task: the task to obtain css_set for
@@ -436,7 +437,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
 * as locks used during the cgroup_subsys::attach() methods.
 */
#ifdef CONFIG_PROVE_RCU
extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c) \
        rcu_dereference_check((task)->cgroups,                          \
......
@@ -72,6 +72,8 @@ struct mem_cgroup_reclaim_cookie {
        unsigned int generation;
};
struct alloc_context;
#ifdef CONFIG_MEMCG
#define MEM_CGROUP_ID_SHIFT 16
@@ -296,6 +298,9 @@ struct mem_cgroup {
        bool tcpmem_active;
        int tcpmem_pressure;

        int wmark_min_adj;      /* user-set value */
        int wmark_min_eadj;     /* value in effect */

        unsigned int wmark_ratio;
        struct work_struct wmark_work;
        unsigned int wmark_scale_factor;
@@ -550,6 +555,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
}
void mem_cgroup_handle_over_high(void);
void mem_cgroup_wmark_min_throttle(void);
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
@@ -858,6 +864,9 @@ static inline bool is_wmark_ok(struct mem_cgroup *memcg, bool high)
        return page_counter_read(&memcg->memory) < memcg->memory.wmark_low;
}
int memcg_get_wmark_min_adj(struct task_struct *curr);
void memcg_check_wmark_min_adj(struct task_struct *curr,
                               struct alloc_context *ac);
#else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0
@@ -1059,6 +1068,10 @@ static inline void mem_cgroup_handle_over_high(void)
{
}

static inline void mem_cgroup_wmark_min_throttle(void)
{
}

static inline void mem_cgroup_enter_user_fault(void)
{
}
@@ -1179,6 +1192,16 @@ static inline bool is_wmark_ok(struct mem_cgroup *memcg, bool low)
{
        return false;
}

static inline int memcg_get_wmark_min_adj(struct task_struct *curr)
{
        return 0;
}

static inline void memcg_check_wmark_min_adj(struct task_struct *curr,
                                             struct alloc_context *ac)
{
}
#endif /* CONFIG_MEMCG */
/* idx can be of type enum memcg_stat_item or node_stat_item */
......
@@ -1179,6 +1179,7 @@ struct task_struct {
        /* Number of pages to reclaim on returning to userland: */
        unsigned int                    memcg_nr_pages_over_high;

        unsigned int                    wmark_min_throttle_ms;

        /* Used by memcontrol for targeted memcg charge: */
        struct mem_cgroup               *active_memcg;
......
@@ -193,6 +193,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
                task_work_run();

        mem_cgroup_handle_over_high();
        mem_cgroup_wmark_min_throttle();
        blkcg_maybe_throttle_current();
}
......
@@ -34,6 +34,7 @@
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
@@ -56,6 +57,7 @@
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/psi.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
@@ -65,7 +67,6 @@
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -4086,6 +4087,168 @@ static ssize_t memory_wmark_scale_factor_write(struct kernfs_open_file *of,
        return nbytes;
}
/*
 * Figure out the maximal (most conservative) @wmark_min_adj along
 * the hierarchy, excluding intermediate default zeros, as the
 * effective one. Example:
 *
 *                 root
 *                /    \
 *               A      D
 *              / \
 *             B   C
 *                / \
 *               E   F
 *
 * wmark_min_adj:  A -10, B -25, C 0, D 50, E -25, F 50
 * wmark_min_eadj: A -10, B -10, C 0, D 50, E -10, F 50
 */
static void memcg_update_wmark_min_adj(struct mem_cgroup *memcg, int val)
{
        struct mem_cgroup *p;
        struct mem_cgroup *iter;

        mutex_lock(&cgroup_mutex);

        memcg->wmark_min_adj = val;
        /* update hierarchical wmark_min_eadj, pre-order iteration */
        for_each_mem_cgroup_tree(iter, memcg) {
                if (!mem_cgroup_online(iter))
                        continue;

                val = iter->wmark_min_adj;
                p = parent_mem_cgroup(iter);
                if (p && p->wmark_min_eadj && p->wmark_min_eadj > val)
                        val = p->wmark_min_eadj;
                iter->wmark_min_eadj = val;
        }

        mutex_unlock(&cgroup_mutex);
}
static int memory_wmark_min_adj_show(struct seq_file *m, void *v)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));

        /* show the final effective value */
        seq_printf(m, "%d\n", memcg->wmark_min_eadj);

        return 0;
}
static ssize_t memory_wmark_min_adj_write(struct kernfs_open_file *of,
                                char *buf, size_t nbytes, loff_t off)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
        int ret, wmark_min_adj;

        buf = strstrip(buf);
        ret = kstrtoint(buf, 0, &wmark_min_adj);
        if (ret)
                return ret;

        if (wmark_min_adj < -25 || wmark_min_adj > 50)
                return -EINVAL;

        memcg_update_wmark_min_adj(memcg, wmark_min_adj);

        return nbytes;
}
int memcg_get_wmark_min_adj(struct task_struct *curr)
{
        struct mem_cgroup *memcg;
        int val;

        if (mem_cgroup_disabled())
                return 0;

        rcu_read_lock();
        memcg = mem_cgroup_from_css(task_css(curr, memory_cgrp_id));
        if (mem_cgroup_is_root(memcg))
                val = 0;
        else
                val = memcg->wmark_min_eadj;
        rcu_read_unlock();

        return val;
}
/*
 * Scheduled by global page allocation to be executed from the userland
 * return path and throttle when free is under memcg's global WMARK_MIN.
 */
void mem_cgroup_wmark_min_throttle(void)
{
        unsigned int msec = current->wmark_min_throttle_ms;
        unsigned long pflags;

        if (likely(!msec))
                return;

        psi_memstall_enter(&pflags);
        msleep_interruptible(msec);
        psi_memstall_leave(&pflags);

        current->wmark_min_throttle_ms = 0;
}
#define WMARK_MIN_THROTTLE_MS 100UL

/*
 * Tasks in a memcg with a positive memory.wmark_min_adj have their
 * own global min watermark, higher than the global WMARK_MIN:
 * "WMARK_MIN + (WMARK_LOW - WMARK_MIN) * memory.wmark_min_adj"
 *
 * A positive memory.wmark_min_adj means low QoS requirements. When
 * an allocation breaks the memcg min watermark, which would
 * traditionally trigger direct reclaim, we trigger throttling
 * instead to further prevent such tasks from disturbing others.
 *
 * The throttle time is simply linearly proportional to the pages
 * consumed below the memcg's min watermark.
 *
 * The base throttle time is WMARK_MIN_THROTTLE_MS, and the maximal
 * throttle time is ten times WMARK_MIN_THROTTLE_MS.
 *
 * The actual throttling is executed from the userland return
 * path, see mem_cgroup_wmark_min_throttle().
 */
void memcg_check_wmark_min_adj(struct task_struct *curr,
                               struct alloc_context *ac)
{
        struct zoneref *z;
        struct zone *zone;
        unsigned long wmark_min, wmark, min_low_gap, free_pages;
        int wmark_min_adj = memcg_get_wmark_min_adj(curr);

        if (wmark_min_adj <= 0)
                return;

        /* a throttle is already pending for this task */
        if (curr->wmark_min_throttle_ms)
                return;

        z = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask);
        for_next_zone_zonelist_nodemask(zone, z, ac->zonelist,
                                        ac->high_zoneidx, ac->nodemask) {
                if (cpusets_enabled() &&
                    !__cpuset_zone_allowed(zone, __GFP_HARDWALL))
                        continue;

                wmark_min = min_wmark_pages(zone);
                min_low_gap = low_wmark_pages(zone) - wmark_min;
                free_pages = zone_page_state(zone, NR_FREE_PAGES);
                wmark = wmark_min + min_low_gap * wmark_min_adj / 100;
                if (free_pages < wmark && wmark > wmark_min) {
                        unsigned long msec;

                        /*
                         * The throttle time is simply linearly
                         * proportional to the pages consumed below
                         * the memcg's min watermark.
                         */
                        msec = (wmark - free_pages) * WMARK_MIN_THROTTLE_MS /
                               (wmark - wmark_min);
                        msec = clamp(msec, 1UL, 10 * WMARK_MIN_THROTTLE_MS);
                        curr->wmark_min_throttle_ms = msec;
                        set_notify_resume(curr);
                        break;
                }
        }
}
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
        struct mem_cgroup_threshold_ary *t;
@@ -4945,6 +5108,12 @@ static struct cftype mem_cgroup_legacy_files[] = {
                .seq_show = memory_wmark_scale_factor_show,
                .write = memory_wmark_scale_factor_write,
        },
        {
                .name = "wmark_min_adj",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = memory_wmark_min_adj_show,
                .write = memory_wmark_min_adj_write,
        },
        {
                .name = "force_empty",
                .write = mem_cgroup_force_empty_write,
@@ -5319,6 +5488,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        setup_memcg_wmark(memcg);

        if (parent) {
                memcg->wmark_min_adj = parent->wmark_min_adj;
                memcg->wmark_min_eadj = parent->wmark_min_eadj;
        }

        /* The following stuff does not apply to the root */
        if (!parent) {
                root_mem_cgroup = memcg;
......
@@ -3206,6 +3206,14 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
        int o;
        const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));

        /* apply negative memory.wmark_min_adj */
        if ((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) {
                int min_adj = memcg_get_wmark_min_adj(current);

                if (min_adj < 0)
                        min -= mark * (-min_adj) / 100;
        }

        /* free_pages may go negative - that's OK */
        free_pages -= (1 << order) - 1;
@@ -3232,6 +3240,12 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                        min -= min / 4;
        }

        /*
         * Only happens due to memory.wmark_min_adj.
         * Guarantee safe min after memory.wmark_min_adj?
         */
        if (min < mark / 4)
                min = mark / 4;

#ifdef CONFIG_CMA
        /* If allocation can't use CMA areas don't use free CMA pages */
@@ -4387,6 +4401,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        warn_alloc(gfp_mask, ac->nodemask,
                        "page allocation failure: order:%u", order);
got_pg:
        if (ac->migratetype == MIGRATE_MOVABLE)
                memcg_check_wmark_min_adj(current, ac);

        return page;
}
......