From 40969475355ab785836e782f07994df5cb8404af Mon Sep 17 00:00:00 2001 From: Xu Yu Date: Sat, 28 Dec 2019 01:10:10 +0800 Subject: [PATCH] alinux: mm, memcg: record latency of memcg wmark reclaim The memcg background async page reclaim, a.k.a., memcg kswapd, is implemented with a dedicated unbound workqueue currently. However, memcg kswapd will run too frequently, resulting in high overhead, page cache thrashing, frequent dirty page writeback, etc., due to improper memcg memory.wmark_ratio, unreasonable memcg memory capacity, or even abnormal memcg memory usage. We need to find out the problematic memcg(s) where memcg kswapd introduces significant overhead. This records the latency of each run of memcg kswapd work, and then aggregates it into the per-memcg exstat. Signed-off-by: Xu Yu Reviewed-by: Xunlei Pang --- Documentation/alibaba/interfaces.rst | 4 +++ include/linux/memcontrol.h | 1 + mm/memcontrol.c | 39 +++++++++++++++++++++++----- 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/Documentation/alibaba/interfaces.rst b/Documentation/alibaba/interfaces.rst index 67ef70c65d74..b957e493065c 100644 --- a/Documentation/alibaba/interfaces.rst +++ b/Documentation/alibaba/interfaces.rst @@ -85,6 +85,10 @@ memory.exstat "wmark_min_throttled_ms" field is the total throttled time in milliseconds due to positive memory.wmark_min_adj under global memory pressure. + "wmark_reclaim_work_ms" field is the total background async page reclaim + (a.k.a., memcg kswapd) work time in milliseconds, including sleep/resched + time currently, due to excessive usage of memory over wmark_high. 
+ zombie memcgs reaper ==================== After memcg was deleted, page caches still reference to this memcg diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9b99df809590..8d0950810f85 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -54,6 +54,7 @@ enum memcg_stat_item { enum memcg_exstat_item { MEMCG_WMARK_MIN, + MEMCG_WMARK_RECLAIM, MEMCG_NR_EXSTAT, }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 66037ae2edc7..e663acde0384 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2313,6 +2313,8 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) static void reclaim_wmark(struct mem_cgroup *memcg) { long nr_pages; + struct mem_cgroup *iter; + u64 start, duration; if (is_wmark_ok(memcg, false)) return; @@ -2324,7 +2326,21 @@ static void reclaim_wmark(struct mem_cgroup *memcg) nr_pages = max(SWAP_CLUSTER_MAX, (unsigned long)nr_pages); + /* + * Typically, we would like to record the actual cpu% of reclaim_wmark + * work, excluding any sleep/resched time. However, currently we just + * simply record the whole duration of reclaim_wmark work for the + * overhead-accuracy trade-off. 
+ */ + start = ktime_get_ns(); try_to_free_mem_cgroup_pages(memcg, nr_pages, GFP_KERNEL, true); + duration = ktime_get_ns() - start; + + css_get(&memcg->css); + for (iter = memcg; iter; iter = parent_mem_cgroup(iter)) + this_cpu_add(iter->exstat_cpu->item[MEMCG_WMARK_RECLAIM], + duration); + css_put(&memcg->css); } static void wmark_work_func(struct work_struct *work) @@ -4167,17 +4183,26 @@ static int memcg_stat_show(struct seq_file *m, void *v) return 0; } +static u64 memcg_exstat_gather(struct mem_cgroup *memcg, + enum memcg_exstat_item idx) +{ + u64 sum = 0; + int cpu; + + for_each_online_cpu(cpu) + sum += per_cpu_ptr(memcg->exstat_cpu, cpu)->item[idx]; + + return sum; +} + static int memcg_exstat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - u64 wmark_min = 0; - int cpu; - for_each_possible_cpu(cpu) { - wmark_min += - per_cpu_ptr(memcg->exstat_cpu, cpu)->item[MEMCG_WMARK_MIN]; - } - seq_printf(m, "wmark_min_throttled_ms %llu\n", wmark_min); + seq_printf(m, "wmark_min_throttled_ms %llu\n", + memcg_exstat_gather(memcg, MEMCG_WMARK_MIN)); + seq_printf(m, "wmark_reclaim_work_ms %llu\n", + memcg_exstat_gather(memcg, MEMCG_WMARK_RECLAIM) / 1000000); return 0; } -- GitLab