From 867b477253b0ee1b48ea027d9dd9df118bdb3727 Mon Sep 17 00:00:00 2001
From: Xunlei Pang
Date: Mon, 6 May 2019 10:34:19 +0800
Subject: [PATCH] alinux: memcg: Provide users the ability to reap zombie memcgs

After a memcg is deleted, page cache pages may still reference it, which
leaves a large number of dead (zombie) memcgs in the system. This slows
down access to files such as "/sys/fs/cgroup/cpu/memory.stat" because of
the many iterations involved, which in turn causes various latencies.

This patch introduces two ways to reclaim these zombie memcgs.

1) Background kthread reaper
   Introduce a kernel thread "zombie_memcg_reaper" to reclaim zombie
   memcgs periodically in the background. Several knobs are also added
   to control the reaper:
   - /sys/kernel/mm/memcg_reaper/scan_interval
     The scan period in seconds. Default 5s.
   - /sys/kernel/mm/memcg_reaper/pages_scan
     The page reclaim budget per scan.
     Default 1310720 (5GiB for 4KiB pages).
   - /sys/kernel/mm/memcg_reaper/verbose
     Output some zombie memcg information for debugging purposes.
     Default off.
   - /sys/kernel/mm/memcg_reaper/reap_background
     "on/off" switch. Default "0" means off. Write "1" to switch it on.

2) One-shot trigger by users
   - /sys/kernel/mm/memcg_reaper/reap
     Write "1" to trigger one round of zombie memcg reaping. A single
     round comes without any guarantee, so you may need to launch
     multiple rounds as needed.

Reviewed-by: Gavin Shan
Signed-off-by: Xunlei Pang
---
 include/linux/memcontrol.h |   4 +
 mm/Makefile                |   2 +-
 mm/memcg_zombie_reaper.c   | 330 +++++++++++++++++++++++++++++++++++++
 mm/memcontrol.c            |   4 +-
 4 files changed, 338 insertions(+), 2 deletions(-)
 create mode 100644 mm/memcg_zombie_reaper.c

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ea5d809c46c0..8011b8ea459a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -331,6 +331,8 @@ struct mem_cgroup {
 	struct idle_page_stats idle_stats[KIDLED_STATS_NR_TYPE];
 #endif
 
+	unsigned long offline_jiffies;
+
 	ALI_HOTFIX_RESERVE(1)
 	ALI_HOTFIX_RESERVE(2)
 	ALI_HOTFIX_RESERVE(3)
@@ -846,6 +848,8 @@ mem_cgroup_idle_page_stats_switch(struct mem_cgroup *memcg)
 }
 #endif /* CONFIG_KIDLED */
 
+void drain_all_stock(struct mem_cgroup *root_memcg);
+
 static inline bool is_wmark_ok(struct mem_cgroup *memcg, bool high)
 {
 	if (high)
diff --git a/mm/Makefile b/mm/Makefile
index 0ca4b8cd21f3..b6c34c72aeca 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -76,7 +76,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
 obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
-obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
+obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o memcg_zombie_reaper.o
 obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o
diff --git a/mm/memcg_zombie_reaper.c b/mm/memcg_zombie_reaper.c
new file mode 100644
index 000000000000..818167f93aa3
--- /dev/null
+++ b/mm/memcg_zombie_reaper.c
@@ -0,0 +1,330 @@
+/*
+ * Reap zombie memcgs:
+ * - reap at background periodically
+ *   echo 1 > /sys/kernel/mm/memcg_reaper/reap_background
+ * - one-shot reap triggered by users
+ *   echo 1 > /sys/kernel/mm/memcg_reaper/reap
+ *
+ * Copyright (C) 2019 Alibaba
+ * Author: Xunlei Pang
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kobject.h>
+#include <linux/kthread.h>
+#include <linux/memcontrol.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/sysfs.h>
+#include <linux/writeback.h>
+#include <linux/swap.h> /* try_to_free_mem_cgroup_pages */
+
+#define for_each_mem_cgroup_tree(iter, root)		\
+	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
+	     iter != NULL;				\
+	     iter = mem_cgroup_iter(root, iter, NULL))
+
+/* Reap by kthread at background, off by default */
+static unsigned int reaper_kthread_on;
+static unsigned int reaper_verbose;
+static unsigned int reaper_scan_interval = 5; /* in seconds */
+/* pages one scan, 5GiB for 4KiB page size */
+static unsigned int reaper_pages_scan = 1310720;
+
+/* The reaper kthread waits here while background reaping is switched off */
+static DECLARE_WAIT_QUEUE_HEAD(reaper_waitq);
+
+#ifdef CONFIG_SYSFS
+static void reap_zombie_memcgs(bool background);
+
+/* Define a 0644 kobj_attribute wired to <name>_show() and <name>_store() */
+#define REAPER_ATTR(_name) \
+	static struct kobj_attribute _name##_attr = \
+		__ATTR(_name, 0644, _name##_show, _name##_store)
+
+static ssize_t pages_scan_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", reaper_pages_scan);
+}
+
+/* Accepts a decimal page count that fits in an unsigned int */
+static ssize_t pages_scan_store(struct kobject *kobj,
+				struct kobj_attribute *attr,
+				const char *buf, size_t count)
+{
+	unsigned long pages;
+	int err;
+
+	err = kstrtoul(buf, 10, &pages);
+	if (err || pages > UINT_MAX)
+		return -EINVAL;
+
+	reaper_pages_scan = pages;
+
+	return count;
+}
+REAPER_ATTR(pages_scan);
+
+static ssize_t scan_interval_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", reaper_scan_interval);
+}
+
+static ssize_t scan_interval_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
+{
+	int err;
+	unsigned long interval;
+
+	err = kstrtoul(buf, 10, &interval);
+	/* interval * 1000 (ms) must not wrap the unsigned int sleep time */
+	if (err || interval > UINT_MAX / 1000)
+		return -EINVAL;
+
+	reaper_scan_interval = interval;
+
+	return count;
+}
+REAPER_ATTR(scan_interval);
+
+static ssize_t verbose_show(struct kobject *kobj,
+			    struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", reaper_verbose);
+}
+
+static ssize_t verbose_store(struct kobject *kobj,
+			     struct kobj_attribute *attr,
+			     const char *buf, size_t count)
+{
+	int err;
+	unsigned long verbose;
+
+	err = kstrtoul(buf, 10, &verbose);
+	if (err || (verbose != 0 && verbose != 1))
+		return -EINVAL;
+
+	reaper_verbose = verbose;
+
+	return count;
+}
+REAPER_ATTR(verbose);
+
+static ssize_t reap_background_show(struct kobject *kobj,
+				    struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", reaper_kthread_on);
+}
+
+static ssize_t reap_background_store(struct kobject *kobj,
+				     struct kobj_attribute *attr,
+				     const char *buf, size_t count)
+{
+	int err;
+	unsigned long enable;
+
+	err = kstrtoul(buf, 10, &enable);
+	if (err || (enable != 0 && enable != 1))
+		return -EINVAL;
+
+	reaper_kthread_on = enable;
+	if (reaper_kthread_on)
+		wake_up_interruptible(&reaper_waitq);
+
+	return count;
+}
+REAPER_ATTR(reap_background);
+
+static ssize_t reap_show(struct kobject *kobj,
+			 struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", 0);
+}
+
+/* Writing "1" runs one non-background reap round in the writer's context */
+static ssize_t reap_store(struct kobject *kobj,
+			  struct kobj_attribute *attr,
+			  const char *buf, size_t count)
+{
+	int err;
+	unsigned long enable;
+
+	err = kstrtoul(buf, 10, &enable);
+	if (err || enable != 1)
+		return -EINVAL;
+
+	reap_zombie_memcgs(false);
+
+	return count;
+}
+REAPER_ATTR(reap);
+
+static struct attribute *reaper_attrs[] = {
+	&pages_scan_attr.attr,
+	&scan_interval_attr.attr,
+	&verbose_attr.attr,
+	&reap_background_attr.attr,
+	&reap_attr.attr,
+	NULL,
+};
+
+static struct attribute_group reaper_attr_group = {
+	.attrs = reaper_attrs,
+	.name = "memcg_reaper",
+};
+#endif
+
+/* Scratch buffer for verbose output; name_buf_lock serializes its users */
+static DEFINE_MUTEX(name_buf_lock);
+static char name_buf[1024];
+
+/*
+ * Reclaim as much as possible from one offline memcg.
+ *
+ * A background pass skips a memcg that went offline less than
+ * dirty_expire_interval ago and still has dirty or writeback pages,
+ * leaving those pages to writeback for the time being.
+ *
+ * Returns the total reported by try_to_free_mem_cgroup_pages().
+ */
+static unsigned long
+do_reap_zombie_memcg(struct mem_cgroup *memcg, bool background)
+{
+	unsigned long did_some = 0;
+	bool drained = false;
+	unsigned int jiffies_thresh = dirty_expire_interval * HZ / 100;
+
+	/* Let dirty dying memcgs be controlled a while by writeback */
+	if (background &&
+	    time_before(jiffies, memcg->offline_jiffies + jiffies_thresh) &&
+	    (memcg_page_state(memcg, NR_FILE_DIRTY) +
+	     memcg_page_state(memcg, NR_WRITEBACK)))
+		return 0;
+
+	/* try to free all pages in this cgroup */
+	while (page_counter_read(&memcg->memory)) {
+		unsigned long ret;
+
+		ret = try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
+		did_some += ret;
+		if (ret)
+			continue;
+
+		/* No progress: drain the per-CPU charge caches once and retry */
+		if (drained == false) {
+			drain_all_stock(memcg);
+			drained = true;
+		} else {
+			break;
+		}
+	}
+
+	if (reaper_verbose) {
+		mutex_lock(&name_buf_lock);
+		cgroup_name(memcg->css.cgroup, name_buf, sizeof(name_buf));
+		if (page_counter_read(&memcg->memory) == 0) {
+			printk_ratelimited("empty zombie memcg: 0x%lx: %s\n",
+				(unsigned long)memcg, name_buf);
+		} else {
+			printk_ratelimited("non-empty zombie memcg: 0x%lx, counter %lu, %s\n",
+				(unsigned long)memcg,
+				page_counter_read(&memcg->memory),
+				name_buf);
+		}
+		mutex_unlock(&name_buf_lock);
+	}
+
+	return did_some;
+}
+
+/*
+ * Walk the memcg tree and reap every memcg that is no longer online.
+ *
+ * A background round stops once at least reaper_pages_scan pages were
+ * reclaimed and then sleeps reaper_scan_interval seconds (if non-zero);
+ * a one-shot round has no such budget and does not sleep.
+ */
+static void reap_zombie_memcgs(bool background)
+{
+	unsigned long reclaimed;
+	unsigned long reclaimed_threshold;
+	struct mem_cgroup *iter;
+
+	reclaimed = 0;
+	reclaimed_threshold = reaper_pages_scan;
+	for_each_mem_cgroup_tree(iter, NULL) {
+		if (background && reclaimed >= reclaimed_threshold) {
+			mem_cgroup_iter_break(NULL, iter);
+			break;
+		}
+		if (mem_cgroup_online(iter))
+			continue;
+		reclaimed += do_reap_zombie_memcg(iter, background);
+		cond_resched();
+	}
+
+	if (background && reaper_scan_interval)
+		msleep_interruptible(reaper_scan_interval*1000);
+}
+
+/* Kthread body: reap in rounds while enabled, otherwise wait to be enabled */
+static int zombie_reaper_thread(void *unused)
+{
+	set_freezable();
+
+	/* Lower its priority to avoid hogging too much cpu */
+	set_user_nice(current, 19);
+
+	while (!kthread_should_stop()) {
+		if (reaper_kthread_on) {
+			reap_zombie_memcgs(true);
+		} else {
+			wait_event_freezable(reaper_waitq,
+				kthread_should_stop() || reaper_kthread_on);
+		}
+
+		try_to_freeze();
+	}
+
+	return 0;
+}
+
+/* Start the reaper kthread and, with CONFIG_SYSFS, create its sysfs group */
+static int __init memcg_zombie_reaper_init(void)
+{
+	static struct task_struct *zombie_reaper;
+	int err __maybe_unused; /* only used with CONFIG_SYSFS */
+
+	zombie_reaper = kthread_run(zombie_reaper_thread,
+				    NULL, "zombie_memcg_reaper");
+	if (IS_ERR(zombie_reaper)) {
+		pr_err("%s: Unable to start reaper kthread\n", __func__);
+		return PTR_ERR(zombie_reaper);
+	}
+
+#ifdef CONFIG_SYSFS
+	err = sysfs_create_group(mm_kobj, &reaper_attr_group);
+	if (err) {
+		kthread_stop(zombie_reaper);
+		pr_err("%s: Unable to populate sysfs files\n", __func__);
+		return err;
+	}
+#endif
+
+	return 0;
+}
+
+module_init(memcg_zombie_reaper_init);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2cd07aa1dc2f..1863abb714ae 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2072,7 +2072,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
  * Drains all per-CPU charge caches for given root_memcg resp. subtree
  * of the hierarchy under it.
  */
-static void drain_all_stock(struct mem_cgroup *root_memcg)
+void drain_all_stock(struct mem_cgroup *root_memcg)
 {
 	int cpu, curcpu;
 
@@ -5364,6 +5364,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 	struct mem_cgroup_event *event, *tmp;
 
+	memcg->offline_jiffies = jiffies;
+
 	/*
 	 * Unregister events and notify userspace.
 	 * Notify userspace about cgroup removing only after rmdir of cgroup
-- 
GitLab