memcg: support memcg sync reclaim work as kswapd

hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IMAK?from=project-issue CVE: NA -------- Since memory.high reclaim is sync whether is in interrupt, it could do more work than direct reclaim, i.e. write out dirty page, etc. So, add PF_KSWAPD flag, so that current_is_kswapd() would return true for memcg kswapd. Memcg kswapd should stop when usage of memcg fit the memcg kswapd stop flag. When the userland sets the memcg->memory.max, the stop_flag is (memcg->memory.high - memcg->memory.max * 10 / 1000), which is similar with global kswapd. Otherwise, the stop_flag is (memcg->memory.high - memcg->memory.high / 6), which is similar with most difference between watermark_low and watermark_high. And, memcg kswapd should not break memory.low protection for now. Signed-off-by: N Lu Jialin <lujialin4@huawei.com> Reviewed-by: N Kefeng Wang <wangkefeng.wang@huawei.com> Reviewed-by: N weiyang wang <wangweiyang2@huawei.com> Signed-off-by: N Zheng Zengkai <zhengzengkai@huawei.com>

memcg: support memcg sync reclaim work as kswapd
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IMAK?from=project-issue CVE: NA -------- Since memory.high reclaim is sync whether is in interrupt, it could do more work than direct reclaim, i.e. write out dirty page, etc. So, add PF_KSWAPD flag, so that current_is_kswapd() would return true for memcg kswapd. Memcg kswapd should stop when usage of memcg fit the memcg kswapd stop flag. When the userland sets the memcg->memory.max, the stop_flag is (memcg->memory.high - memcg->memory.max * 10 / 1000), which is similar with global kswapd. Otherwise, the stop_flag is (memcg->memory.high - memcg->memory.high / 6), which is similar with most difference between watermark_low and watermark_high. And, memcg kswapd should not break memory.low protection for now. Signed-off-by: N Lu Jialin <lujialin4@huawei.com> Reviewed-by: N Kefeng Wang <wangkefeng.wang@huawei.com> Reviewed-by: N weiyang wang <wangweiyang2@huawei.com> Signed-off-by: N Zheng Zengkai <zhengzengkai@huawei.com>
1496d67c · Lu Jialin · Zheng Zengkai · 6a7b3e98 · 1496d67c · 1496d67c
隐藏空白更改
内联并排

Showing with 42 addition and 2 deletion

mm/memcontrol.c mm/memcontrol.c +4 -0

mm/vmscan.c mm/vmscan.c +38 -2

未找到文件。
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2364,8 +2364,10 @@ static void high_work_func(struct work_struct *work)
 {
 	struct mem_cgroup *memcg;

+	current->flags |= PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD;
 	memcg = container_of(work, struct mem_cgroup, high_work);
 	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
+	current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD);
 }

 /*
@@ -2535,9 +2537,11 @@ void mem_cgroup_handle_over_high(void)
 	 * memory.high is currently batched, whereas memory.max and the page
 	 * allocator run every time an allocation is made.
 	 */
+	current->flags |= PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD;
 	nr_reclaimed = reclaim_high(memcg,
 				    in_retry ? SWAP_CLUSTER_MAX : nr_pages,
 				    GFP_KERNEL);
+	current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD);

 	/*
 	 * memory.high is breached and reclaim is unable to keep up. Throttle

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -61,6 +61,8 @@

 #include "internal.h"

+#define MEMCG_KSWAPD_SCATOR 10
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/vmscan.h>

@@ -2834,6 +2836,24 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
 	return inactive_lru_pages > pages_for_compaction;
 }

+static bool is_memcg_kswapd_stopped(struct scan_control *sc)
+{
+	struct mem_cgroup *memcg = sc->target_mem_cgroup;
+	bool is_stop = false;
+	unsigned long stop_flag = 0;
+
+	if (!cgroup_reclaim(sc))
+		return false;
+	if (memcg->memory.max == PAGE_COUNTER_MAX)
+		stop_flag = memcg->memory.high / 6;
+	else
+		stop_flag = memcg->memory.high - memcg->memory.max *
+			    MEMCG_KSWAPD_SCATOR / 1000;
+	is_stop = page_counter_read(&memcg->memory) < stop_flag;
+
+	return (current_is_kswapd() && is_stop);
+}
+
 static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 {
 	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
@@ -2889,6 +2909,14 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 			   sc->nr_scanned - scanned,
 			   sc->nr_reclaimed - reclaimed);

+		/*
+		 * Memcg background reclaim would break iter once memcg kswapd
+		 * flag is satisfied.
+		 */
+		if (is_memcg_kswapd_stopped(sc)) {
+			mem_cgroup_iter_break(target_memcg, memcg);
+			break;
+		}
 	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
 }

@@ -3257,6 +3285,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);

 	do {
+		if (is_memcg_kswapd_stopped(sc))
+			break;
+
 		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
 				sc->priority);
 		sc->nr_scanned = 0;
@@ -3319,8 +3350,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		goto retry;
 	}

-	/* Untapped cgroup reserves?  Don't OOM, retry. */
-	if (sc->memcg_low_skipped) {
+	/*
+	 * Untapped cgroup reserves?  Don't OOM, retry.
+	 * memcg usage is lower than memory.high / 2, memcg kswapd will lead to
+	 * stop memcg reclaim, but should not break low protection.
+	 */
+	if (sc->memcg_low_skipped &&
+	    !(current_is_kswapd() && cgroup_reclaim(sc))) {
 		sc->priority = initial_priority;
 		sc->force_deactivate = 0;
 		sc->memcg_low_reclaim = 1;