alinux: mm, memcg: record latency of memcg wmark reclaim

The memcg background async page reclaim, a.k.a. memcg kswapd, is currently implemented with a dedicated unbound workqueue. However, memcg kswapd may run too frequently, resulting in high overhead, page cache thrashing, frequent dirty page writeback, etc., due to an improper memcg memory.wmark_ratio, unreasonable memcg memory capacity, or even abnormal memcg memory usage. We need to find out the problematic memcg(s) where memcg kswapd introduces significant overhead. This records the latency of each run of memcg kswapd work, and then aggregates it into the per-memcg exstat. Signed-off-by: N Xu Yu <xuyu@linux.alibaba.com> Reviewed-by: N Xunlei Pang <xlpang@linux.alibaba.com>

alinux: mm, memcg: record latency of memcg wmark reclaim
The memcg background async page reclaim, a.k.a. memcg kswapd, is currently implemented with a dedicated unbound workqueue. However, memcg kswapd may run too frequently, resulting in high overhead, page cache thrashing, frequent dirty page writeback, etc., due to an improper memcg memory.wmark_ratio, unreasonable memcg memory capacity, or even abnormal memcg memory usage. We need to find out the problematic memcg(s) where memcg kswapd introduces significant overhead. This records the latency of each run of memcg kswapd work, and then aggregates it into the per-memcg exstat. Signed-off-by: N Xu Yu <xuyu@linux.alibaba.com> Reviewed-by: N Xunlei Pang <xlpang@linux.alibaba.com>
3ec93e1a · Xu Yu · 87bac306 · 3ec93e1a · 3ec93e1a · 3ec93e1a
隐藏空白更改
内联并排

Showing 3 changed files with 37 additions and 7 deletions

Documentation/alibaba/interfaces.rst Documentation/alibaba/interfaces.rst +4 -0

include/linux/memcontrol.h include/linux/memcontrol.h +1 -0

mm/memcontrol.c mm/memcontrol.c +32 -7

未找到文件。
--- a/Documentation/alibaba/interfaces.rst
+++ b/Documentation/alibaba/interfaces.rst
@@ -85,6 +85,10 @@ memory.exstat
    "wmark_min_throttled_ms" field is the total throttled time in milliseconds
    due to positive memory.wmark_min_adj under global memory pressure.
+    "wmark_reclaim_work_ms" field is the total background async page reclaim
+    (a.k.a. memcg kswapd) work time in milliseconds, currently including
+    sleep/resched time, due to excessive usage of memory over wmark_high.
 zombie memcgs reaper
 ====================
    After memcg was deleted, page caches still reference to this memcg

--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -54,6 +54,7 @@ enum memcg_stat_item {
 enum memcg_exstat_item {
 	MEMCG_WMARK_MIN,
+	MEMCG_WMARK_RECLAIM,
 	MEMCG_NR_EXSTAT,
 };

--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2313,6 +2313,8 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 static void reclaim_wmark(struct mem_cgroup *memcg)
 {
 	long nr_pages;
+	struct mem_cgroup *iter;
+	u64 start, duration;
 	if (is_wmark_ok(memcg, false))
 		return;
@@ -2324,7 +2326,21 @@ static void reclaim_wmark(struct mem_cgroup *memcg)
 	nr_pages = max(SWAP_CLUSTER_MAX, (unsigned long)nr_pages);
+	/*
+	 * Typically, we would like to record the actual cpu% of reclaim_wmark
+	 * work, excluding any sleep/resched time.  However, currently we just
+	 * simply record the whole duration of reclaim_wmark work for the
+	 * overhead-accuracy trade-off.
+	 */
+	start = ktime_get_ns();
 	try_to_free_mem_cgroup_pages(memcg, nr_pages, GFP_KERNEL, true);
+	duration = ktime_get_ns() - start;
+	css_get(&memcg->css);
+	for (iter = memcg; iter; iter = parent_mem_cgroup(iter))
+		this_cpu_add(iter->exstat_cpu->item[MEMCG_WMARK_RECLAIM],
+			     duration);
+	css_put(&memcg->css);
 }
 static void wmark_work_func(struct work_struct *work)
@@ -4167,17 +4183,26 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 	return 0;
 }
+/*
+ * Sum one exstat counter of @memcg over its per-cpu slots.
+ *
+ * @memcg: memory cgroup whose exstat_cpu counters are read.
+ * @idx:   which memcg_exstat_item to sum.
+ *
+ * Returns the raw total; any unit conversion is left to the caller.
+ *
+ * Walk every possible CPU, as the loop previously open-coded in
+ * memcg_exstat_show() did.  Summing only online CPUs would leave out
+ * whatever a since-offlined CPU had accumulated, so the reported total
+ * could go backwards.
+ * NOTE(review): assumes no cpu-hotplug callback folds exstat_cpu into
+ * another CPU's slot -- confirm against memcg_hotplug_cpu_dead().
+ */
+static u64 memcg_exstat_gather(struct mem_cgroup *memcg,
+			       enum memcg_exstat_item idx)
+{
+	u64 sum = 0;
+	int cpu;
+	for_each_possible_cpu(cpu)
+		sum += per_cpu_ptr(memcg->exstat_cpu, cpu)->item[idx];
+	return sum;
+}
 static int memcg_exstat_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
-	u64 wmark_min = 0;
-	int cpu;
-	for_each_possible_cpu(cpu) {
+	seq_printf(m, "wmark_min_throttled_ms %llu\n",
-		wmark_min +=
+		   memcg_exstat_gather(memcg, MEMCG_WMARK_MIN));
-		per_cpu_ptr(memcg->exstat_cpu, cpu)->item[MEMCG_WMARK_MIN];
+	seq_printf(m, "wmark_reclaim_work_ms %llu\n",
-	}
+		   memcg_exstat_gather(memcg, MEMCG_WMARK_RECLAIM) / 1000000);
-	seq_printf(m, "wmark_min_throttled_ms %llu\n", wmark_min);
 	return 0;
 }