From 6b2ef082765d69c6946d1c4279db3c78708074dd Mon Sep 17 00:00:00 2001
From: Yang Shi
Date: Wed, 14 Aug 2019 03:11:42 +0800
Subject: [PATCH] alios: mm: memcontrol: support background async page reclaim

Currently, when memory usage exceeds the memory cgroup limit, the memory
cgroup can only do synchronous direct reclaim.  This may incur unexpected
stalls in applications that are sensitive to latency.

Introduce a background async page reclaim mechanism, similar to what
kswapd does.

Define the memcg memory usage water mark by introducing a wmark_ratio
interface, which ranges from 0 to 100 and represents a percentage of the
max limit.  wmark_high is calculated as (max * wmark_ratio / 100) and
wmark_low as (wmark_high - (wmark_high >> 8)), which is an empirical
value.  If wmark_ratio is 0, the water mark is disabled and both
wmark_low and wmark_high are max, which is the default.

If wmark_ratio is set, then when charging a page, if the usage is greater
than wmark_high, which means the available memory of the memcg is low, a
work is scheduled to do background page reclaim until memory usage is
reduced to wmark_low if possible.

Define a dedicated unbound workqueue for scheduling water mark reclaim
works.

Reviewed-by: Gavin Shan
Reviewed-by: Xunlei Pang
Signed-off-by: Yang Shi
---
 Documentation/cgroup-v1/memory.txt |  21 ++++-
 include/linux/memcontrol.h         |  16 ++++
 include/linux/page_counter.h       |   8 ++
 mm/memcontrol.c                    | 143 ++++++++++++++++++++++++++++-
 mm/page_counter.c                  |  12 +++
 5 files changed, 197 insertions(+), 3 deletions(-)

diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt
index 3682e99234c2..51af42b1e007 100644
--- a/Documentation/cgroup-v1/memory.txt
+++ b/Documentation/cgroup-v1/memory.txt
@@ -88,6 +88,12 @@ Brief summary of control files.
  memory.kmem.tcp.failcnt             # show the number of tcp buf memory usage hits limits
  memory.kmem.tcp.max_usage_in_bytes  # show max tcp buf memory usage recorded
 
+ memory.wmark_ratio                  # water mark ratio
+ memory.wmark_low                    # low limit (memory usage low water mark,
+                                       read-only)
+ memory.wmark_high                   # high limit (memory usage high water mark,
+                                       read-only)
+
 1. History
 
 The memory controller has a long history. A request for comments for the memory
@@ -853,7 +859,20 @@ Test:
 (Expect a bunch of notifications, and eventually, the oom-killer will
 trigger.)
 
-12. TODO
+12. Background reclaim
+
+The user can set up a memory usage water mark by echoing a value to
+memory.wmark_ratio.  Valid values range from 0 to 100 and represent a
+percentage of the max limit.  wmark_low and wmark_high are calculated from
+the max limit and wmark_ratio.  0 means the water mark is disabled and both
+wmark_low and wmark_high are max; this is the default.
+
+Once the water mark is set up, when charging pages to the memcg, if the usage
+exceeds wmark_high, which means available memory is low, a work is scheduled
+to reclaim pages in the background and try to reduce memory usage to
+wmark_low if possible.
+
+13. TODO
 
 1. Make per-cgroup scanner reclaim not-shared pages first
 2. Teach controller to account for shared-pages

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index cc6b6532eb56..15f9688e7dee 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -288,6 +288,9 @@ struct mem_cgroup {
 	bool		tcpmem_active;
 	int		tcpmem_pressure;
 
+	unsigned int wmark_ratio;
+	struct work_struct wmark_work;
+
 #ifdef CONFIG_MEMCG_KMEM
 	/* Index in the kmem_cache->memcg_params.memcg_caches array */
 	int kmemcg_id;
@@ -793,6 +796,14 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
 void mem_cgroup_split_huge_fixup(struct page *head);
 #endif
 
+static inline bool is_wmark_ok(struct mem_cgroup *memcg, bool high)
+{
+	if (high)
+		return page_counter_read(&memcg->memory) < memcg->memory.wmark_high;
+
+	return page_counter_read(&memcg->memory) < memcg->memory.wmark_low;
+}
+
 #else /* CONFIG_MEMCG */
 
 #define MEM_CGROUP_ID_SHIFT	0
@@ -1109,6 +1120,11 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
 					  enum vm_event_item idx)
 {
 }
+
+static inline bool is_wmark_ok(struct mem_cgroup *memcg, bool low)
+{
+	return false;
+}
 #endif /* CONFIG_MEMCG */
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index bab7e57f659b..10dfa378aa92 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -23,6 +23,10 @@ struct page_counter {
 	atomic_long_t low_usage;
 	atomic_long_t children_low_usage;
 
+	/* water mark low and high */
+	unsigned long wmark_low;
+	unsigned long wmark_high;
+
 	/* legacy */
 	unsigned long watermark;
 	unsigned long failcnt;
@@ -55,6 +59,10 @@ bool page_counter_try_charge(struct page_counter *counter,
 void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
+void page_counter_set_wmark_high(struct page_counter *counter,
+				unsigned long nr_pages);
+void page_counter_set_wmark_low(struct page_counter *counter,
+				unsigned long nr_pages);
 int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
 int page_counter_memparse(const char *buf, const char *max,
 			  unsigned long *nr_pages);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dfd98ff2a3f0..d516877c2f6e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -95,6 +95,8 @@ int do_swap_account __read_mostly;
 #define do_swap_account		0
 #endif
 
+struct workqueue_struct *memcg_wmark_wq;
+
 /* Whether legacy memory+swap accounting is active */
 static bool do_memsw_account(void)
 {
@@ -2156,6 +2158,34 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 	return 0;
 }
 
+static void reclaim_wmark(struct mem_cgroup *memcg)
+{
+	long nr_pages;
+
+	if (is_wmark_ok(memcg, false))
+		return;
+
+	nr_pages = page_counter_read(&memcg->memory) -
+		   memcg->memory.wmark_low;
+	if (nr_pages <= 0)
+		return;
+
+	nr_pages = max(SWAP_CLUSTER_MAX, (unsigned long)nr_pages);
+
+	try_to_free_mem_cgroup_pages(memcg, nr_pages, GFP_KERNEL, true);
+}
+
+static void wmark_work_func(struct work_struct *work)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = container_of(work, struct mem_cgroup, wmark_work);
+
+	current->flags |= PF_SWAPWRITE | PF_MEMALLOC;
+	reclaim_wmark(memcg);
+	current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC);
+}
+
 static void reclaim_high(struct mem_cgroup *memcg,
 			 unsigned int nr_pages,
 			 gfp_t gfp_mask)
@@ -2479,6 +2509,11 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * reclaim, the cost of mismatch is negligible.
 	 */
 	do {
+		if (!is_wmark_ok(memcg, true)) {
+			queue_work(memcg_wmark_wq, &memcg->wmark_work);
+			break;
+		}
+
 		if (page_counter_read(&memcg->memory) > memcg->high) {
 			/* Don't bother a random interrupted task */
 			if (in_interrupt()) {
@@ -2906,6 +2941,25 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
 }
 #endif
 
+static void setup_memcg_wmark(struct mem_cgroup *memcg)
+{
+	unsigned long high_wmark;
+	unsigned long low_wmark;
+	unsigned long max = memcg->memory.max;
+	unsigned int wmark_ratio = memcg->wmark_ratio;
+
+	if (wmark_ratio) {
+		high_wmark = (max * wmark_ratio) / 100;
+		low_wmark = high_wmark - (high_wmark >> 8);
+
+		page_counter_set_wmark_low(&memcg->memory, low_wmark);
+		page_counter_set_wmark_high(&memcg->memory, high_wmark);
+	} else {
+		page_counter_set_wmark_low(&memcg->memory, PAGE_COUNTER_MAX);
+		page_counter_set_wmark_high(&memcg->memory, PAGE_COUNTER_MAX);
+	}
+}
+
 static DEFINE_MUTEX(memcg_max_mutex);
 
 static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
@@ -2956,8 +3010,15 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
 		}
 	} while (true);
 
-	if (!ret && enlarge)
-		memcg_oom_recover(memcg);
+	if (!ret) {
+		setup_memcg_wmark(memcg);
+
+		if (!is_wmark_ok(memcg, true))
+			queue_work(memcg_wmark_wq, &memcg->wmark_work);
+
+		if (enlarge)
+			memcg_oom_recover(memcg);
+	}
 
 	return ret;
 }
@@ -3201,6 +3262,8 @@ enum {
 	RES_MAX_USAGE,
 	RES_FAILCNT,
 	RES_SOFT_LIMIT,
+	WMARK_HIGH_LIMIT,
+	WMARK_LOW_LIMIT,
 };
 
 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
@@ -3241,6 +3304,10 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
 		return counter->failcnt;
 	case RES_SOFT_LIMIT:
 		return (u64)memcg->soft_limit * PAGE_SIZE;
+	case WMARK_HIGH_LIMIT:
+		return (u64)counter->wmark_high * PAGE_SIZE;
+	case WMARK_LOW_LIMIT:
+		return (u64)counter->wmark_low * PAGE_SIZE;
 	default:
 		BUG();
 	}
@@ -3686,6 +3753,43 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
 	return 0;
 }
 
+static int memory_wmark_ratio_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	unsigned int wmark_ratio = READ_ONCE(memcg->wmark_ratio);
+
+	seq_printf(m, "%d\n", wmark_ratio);
+
+	return 0;
+}
+
+static ssize_t memory_wmark_ratio_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int ret, wmark_ratio;
+
+	buf = strstrip(buf);
+	if (!buf)
+		return -EINVAL;
+
+	ret = kstrtouint(buf, 0, &wmark_ratio);
+	if (ret)
+		return ret;
+
+	if (wmark_ratio > 100)
+		return -EINVAL;
+
+	xchg(&memcg->wmark_ratio, wmark_ratio);
+
+	setup_memcg_wmark(memcg);
+
+	if (!is_wmark_ok(memcg, true))
+		queue_work(memcg_wmark_wq, &memcg->wmark_work);
+
+	return nbytes;
+}
+
 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
 {
 	struct mem_cgroup_threshold_ary *t;
@@ -4393,6 +4497,24 @@ static struct cftype mem_cgroup_legacy_files[] = {
 		.name = "stat",
 		.seq_show = memcg_stat_show,
 	},
+	{
+		.name = "wmark_ratio",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_wmark_ratio_show,
+		.write = memory_wmark_ratio_write,
+	},
+	{
+		.name = "wmark_high",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = MEMFILE_PRIVATE(_MEM, WMARK_HIGH_LIMIT),
+		.read_u64 = mem_cgroup_read_u64,
+	},
+	{
+		.name = "wmark_low",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = MEMFILE_PRIVATE(_MEM, WMARK_LOW_LIMIT),
+		.read_u64 = mem_cgroup_read_u64,
+	},
 	{
 		.name = "force_empty",
 		.write = mem_cgroup_force_empty_write,
@@ -4654,6 +4776,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 		goto fail;
 
 	INIT_WORK(&memcg->high_work, high_work_func);
+	INIT_WORK(&memcg->wmark_work, wmark_work_func);
 	memcg->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&memcg->oom_notify);
 	mutex_init(&memcg->thresholds_lock);
@@ -4692,6 +4815,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	if (parent) {
 		memcg->swappiness = mem_cgroup_swappiness(parent);
 		memcg->oom_kill_disable = parent->oom_kill_disable;
+		memcg->wmark_ratio = parent->wmark_ratio;
 	}
 	if (parent && parent->use_hierarchy) {
 		memcg->use_hierarchy = true;
@@ -4715,6 +4839,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		memory_cgrp_subsys.broken_hierarchy = true;
 	}
 
+	setup_memcg_wmark(memcg);
+
 	/* The following stuff does not apply to the root */
 	if (!parent) {
 		root_mem_cgroup = memcg;
@@ -4775,6 +4901,9 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
 
+	page_counter_set_wmark_low(&memcg->memory, PAGE_COUNTER_MAX);
+	page_counter_set_wmark_high(&memcg->memory, PAGE_COUNTER_MAX);
+
 	memcg_offline_kmem(memcg);
 	wb_memcg_offline(memcg);
 
@@ -4800,6 +4929,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 
 	vmpressure_cleanup(&memcg->vmpressure);
 	cancel_work_sync(&memcg->high_work);
+	cancel_work_sync(&memcg->wmark_work);
 	mem_cgroup_remove_from_trees(memcg);
 	memcg_free_shrinker_maps(memcg);
 	memcg_free_kmem(memcg);
@@ -4830,6 +4960,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
+	page_counter_set_wmark_low(&memcg->memory, PAGE_COUNTER_MAX);
+	page_counter_set_wmark_high(&memcg->memory, PAGE_COUNTER_MAX);
 	memcg->high = PAGE_COUNTER_MAX;
 	memcg->soft_limit = PAGE_COUNTER_MAX;
 	memcg_wb_domain_size_changed(memcg);
@@ -6557,6 +6689,13 @@ static int __init mem_cgroup_init(void)
 {
 	int cpu, node;
 
+	memcg_wmark_wq = alloc_workqueue("memcg_wmark", WQ_MEM_RECLAIM |
+					 WQ_UNBOUND | WQ_FREEZABLE,
+					 WQ_UNBOUND_MAX_ACTIVE);
+
+	if (!memcg_wmark_wq)
+		return -ENOMEM;
+
 #ifdef CONFIG_MEMCG_KMEM
 	/*
 	 * Kmem cache creation is mostly done with the slab_mutex held,
diff --git a/mm/page_counter.c b/mm/page_counter.c
index de31470655f6..ee480e6884a0 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -236,6 +236,18 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
 		propagate_protected_usage(c, atomic_long_read(&c->usage));
 }
 
+void page_counter_set_wmark_high(struct page_counter *counter,
+				unsigned long nr_pages)
+{
+	xchg(&counter->wmark_high, nr_pages);
+}
+
+void page_counter_set_wmark_low(struct page_counter *counter,
+				unsigned long nr_pages)
+{
+	xchg(&counter->wmark_low, nr_pages);
+}
+
 /**
  * page_counter_memparse - memparse() for page counter limits
  * @buf: string to parse
-- 
GitLab
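
Editor's note: as a quick sanity check of the water mark arithmetic, the
standalone userspace sketch below mirrors what setup_memcg_wmark() computes
(wmark_high = max * wmark_ratio / 100, wmark_low = wmark_high -
(wmark_high >> 8)).  It is only an illustration, not part of the patch; the
8 GiB limit, the 80% ratio (as if set with "echo 80 > memory.wmark_ratio")
and the 4 KiB page size are assumed example values.

/*
 * Illustrative userspace sketch (not part of the patch).  It mirrors the
 * setup_memcg_wmark() arithmetic so the resulting marks can be checked by
 * hand.  The 8 GiB limit, 80% ratio and 4 KiB page size are assumed
 * example values only.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long long page_size = 4096;		/* assumed PAGE_SIZE */
	const unsigned long long max_bytes = 8ULL << 30;	/* e.g. limit_in_bytes = 8 GiB */
	const unsigned int wmark_ratio = 80;			/* e.g. echo 80 > memory.wmark_ratio */

	unsigned long long max_pages = max_bytes / page_size;
	unsigned long long wmark_high, wmark_low;

	if (wmark_ratio) {
		wmark_high = max_pages * wmark_ratio / 100;
		/* the low mark sits (wmark_high >> 8) pages below the high mark */
		wmark_low = wmark_high - (wmark_high >> 8);
	} else {
		/* ratio 0 disables the water marks (PAGE_COUNTER_MAX in-kernel) */
		wmark_high = wmark_low = ~0ULL;
	}

	printf("wmark_high: %llu pages (%llu bytes)\n",
	       wmark_high, wmark_high * page_size);
	printf("wmark_low:  %llu pages (%llu bytes)\n",
	       wmark_low, wmark_low * page_size);
	return 0;
}

With these inputs it reports wmark_high = 1677721 pages and wmark_low =
1671168 pages, i.e. the low mark sits roughly 1/256 (about 0.4%) below the
high mark, which is the hysteresis the changelog calls an empirical value;
memory.wmark_high and memory.wmark_low read back these counts multiplied by
PAGE_SIZE.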