diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt
index 3682e99234c2c6652ac4990504dfb14bd3873618..51af42b1e007be0e6f550ced7313a29d366e8888 100644
--- a/Documentation/cgroup-v1/memory.txt
+++ b/Documentation/cgroup-v1/memory.txt
@@ -88,6 +88,12 @@ Brief summary of control files.
  memory.kmem.tcp.failcnt            # show the number of tcp buf memory usage hits limits
  memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded
 
+ memory.wmark_ratio		     # water mark ratio
+ memory.wmark_low		     # low limit (memory usage low water mark,
+				       read-only)
+ memory.wmark_high		     # high limit (memory usage high water mark,
+				       read-only)
+
 1. History
 
 The memory controller has a long history. A request for comments for the memory
@@ -853,7 +859,20 @@ Test:
 	(Expect a bunch of notifications, and eventually, the oom-killer will
 	trigger.)
 
-12. TODO
+12. Background reclaim
+
+The user can set up a memory usage water mark by echoing a value to
+memory.wmark_ratio. Valid values range from 0 to 100 and represent a
+percentage of the max limit. wmark_low and wmark_high are calculated
+from the max limit and wmark_ratio. 0 means the water mark is disabled;
+both wmark_low and wmark_high are then max, which is the default.
+
+Once the water mark is set up, if the usage exceeds wmark_high while
+charging pages to the memcg (meaning available memory is low), a work
+item is scheduled to reclaim pages in the background and try to reduce
+memory usage to wmark_low if possible.
+
+13. TODO
 
 1. Make per-cgroup scanner reclaim not-shared pages first
 2. Teach controller to account for shared-pages
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index cc6b6532eb56e75943e8cc999514dddfdc79dafe..15f9688e7deeb8190aaeada0e1150c8394b3fb05 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -288,6 +288,9 @@ struct mem_cgroup {
 	bool			tcpmem_active;
 	int			tcpmem_pressure;
 
+	unsigned int wmark_ratio;
+	struct work_struct wmark_work;
+
 #ifdef CONFIG_MEMCG_KMEM
 	/* Index in the kmem_cache->memcg_params.memcg_caches array */
 	int kmemcg_id;
@@ -793,6 +796,14 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
 void mem_cgroup_split_huge_fixup(struct page *head);
 #endif
 
+static inline bool is_wmark_ok(struct mem_cgroup *memcg, bool high)
+{
+	if (high)
+		return page_counter_read(&memcg->memory) < memcg->memory.wmark_high;
+
+	return page_counter_read(&memcg->memory) < memcg->memory.wmark_low;
+}
+
 #else /* CONFIG_MEMCG */
 
 #define MEM_CGROUP_ID_SHIFT	0
@@ -1109,6 +1120,11 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
 					  enum vm_event_item idx)
 {
 }
+
+static inline bool is_wmark_ok(struct mem_cgroup *memcg, bool high)
+{
+	return false;
+}
 #endif /* CONFIG_MEMCG */
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index bab7e57f659b4c03d75548c32edcb12dcab86911..10dfa378aa92a69a261f6e15ef28e69c86031883 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -23,6 +23,10 @@ struct page_counter {
 	atomic_long_t low_usage;
 	atomic_long_t children_low_usage;
 
+	/* water mark low and high */
+	unsigned long wmark_low;
+	unsigned long wmark_high;
+
 	/* legacy */
 	unsigned long watermark;
 	unsigned long failcnt;
@@ -55,6 +59,10 @@ bool page_counter_try_charge(struct page_counter *counter,
 void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
+void page_counter_set_wmark_high(struct page_counter *counter,
+				 unsigned long nr_pages);
+void page_counter_set_wmark_low(struct page_counter *counter,
+				unsigned long nr_pages);
 int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
 int page_counter_memparse(const char *buf, const char *max,
 			  unsigned long *nr_pages);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 38c6f439ff533899c981c0f99cf965902f92ae18..d02ee014ecc5cdc9af3ecf9ecd0a39ff37b896df 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -95,6 +95,8 @@ int do_swap_account __read_mostly;
 #define do_swap_account		0
 #endif
 
+static struct workqueue_struct *memcg_wmark_wq;
+
 /* Whether legacy memory+swap accounting is active */
 static bool do_memsw_account(void)
 {
@@ -2156,6 +2158,34 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 	return 0;
 }
 
+static void reclaim_wmark(struct mem_cgroup *memcg)
+{
+	long nr_pages;
+
+	if (is_wmark_ok(memcg, false))
+		return;
+
+	nr_pages = page_counter_read(&memcg->memory) -
+		   memcg->memory.wmark_low;
+	if (nr_pages <= 0)
+		return;
+
+	nr_pages = max(SWAP_CLUSTER_MAX, (unsigned long)nr_pages);
+
+	try_to_free_mem_cgroup_pages(memcg, nr_pages, GFP_KERNEL, true);
+}
+
+static void wmark_work_func(struct work_struct *work)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = container_of(work, struct mem_cgroup, wmark_work);
+
+	current->flags |= PF_SWAPWRITE | PF_MEMALLOC;
+	reclaim_wmark(memcg);
+	current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC);
+}
+
 static void reclaim_high(struct mem_cgroup *memcg,
 			 unsigned int nr_pages,
 			 gfp_t gfp_mask)
@@ -2488,6 +2518,11 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * reclaim, the cost of mismatch is negligible.
 	 */
 	do {
+		if (!is_wmark_ok(memcg, true)) {
+			queue_work(memcg_wmark_wq, &memcg->wmark_work);
+			break;
+		}
+
 		if (page_counter_read(&memcg->memory) > memcg->high) {
 			/* Don't bother a random interrupted task */
 			if (in_interrupt()) {
@@ -2915,6 +2950,25 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
 }
 #endif
 
+static void setup_memcg_wmark(struct mem_cgroup *memcg)
+{
+	unsigned long high_wmark;
+	unsigned long low_wmark;
+	unsigned long max = memcg->memory.max;
+	unsigned int wmark_ratio = memcg->wmark_ratio;
+
+	if (wmark_ratio) {
+		high_wmark = (max * wmark_ratio) / 100;
+		low_wmark = high_wmark - (high_wmark >> 8);
+
+		page_counter_set_wmark_low(&memcg->memory, low_wmark);
+		page_counter_set_wmark_high(&memcg->memory, high_wmark);
+	} else {
+		page_counter_set_wmark_low(&memcg->memory, PAGE_COUNTER_MAX);
+		page_counter_set_wmark_high(&memcg->memory, PAGE_COUNTER_MAX);
+	}
+}
+
 static DEFINE_MUTEX(memcg_max_mutex);
 
 static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
@@ -2965,8 +3019,15 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
 		}
 	} while (true);
 
-	if (!ret && enlarge)
-		memcg_oom_recover(memcg);
+	if (!ret) {
+		setup_memcg_wmark(memcg);
+
+		if (!is_wmark_ok(memcg, true))
+			queue_work(memcg_wmark_wq, &memcg->wmark_work);
+
+		if (enlarge)
+			memcg_oom_recover(memcg);
+	}
 
 	return ret;
 }
@@ -3210,6 +3271,8 @@ enum {
 	RES_MAX_USAGE,
 	RES_FAILCNT,
 	RES_SOFT_LIMIT,
+	WMARK_HIGH_LIMIT,
+	WMARK_LOW_LIMIT,
 };
 
 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
@@ -3250,6 +3313,10 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
 		return counter->failcnt;
 	case RES_SOFT_LIMIT:
 		return (u64)memcg->soft_limit * PAGE_SIZE;
+	case WMARK_HIGH_LIMIT:
+		return (u64)counter->wmark_high * PAGE_SIZE;
+	case WMARK_LOW_LIMIT:
+		return (u64)counter->wmark_low * PAGE_SIZE;
 	default:
 		BUG();
 	}
@@ -3695,6 +3762,43 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
 	return 0;
 }
 
+static int memory_wmark_ratio_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	unsigned int wmark_ratio = READ_ONCE(memcg->wmark_ratio);
+
+	seq_printf(m, "%u\n", wmark_ratio);
+
+	return 0;
+}
+
+static ssize_t memory_wmark_ratio_write(struct kernfs_open_file *of,
+					char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int ret, wmark_ratio;
+
+	buf = strstrip(buf);
+	if (!buf)
+		return -EINVAL;
+
+	ret = kstrtoint(buf, 0, &wmark_ratio);
+	if (ret)
+		return ret;
+
+	if (wmark_ratio < 0 || wmark_ratio > 100)
+		return -EINVAL;
+
+	xchg(&memcg->wmark_ratio, (unsigned int)wmark_ratio);
+
+	setup_memcg_wmark(memcg);
+
+	if (!is_wmark_ok(memcg, true))
+		queue_work(memcg_wmark_wq, &memcg->wmark_work);
+
+	return nbytes;
+}
+
 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
 {
 	struct mem_cgroup_threshold_ary *t;
@@ -4402,6 +4506,24 @@ static struct cftype mem_cgroup_legacy_files[] = {
 		.name = "stat",
 		.seq_show = memcg_stat_show,
 	},
+	{
+		.name = "wmark_ratio",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_wmark_ratio_show,
+		.write = memory_wmark_ratio_write,
+	},
+	{
+		.name = "wmark_high",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = MEMFILE_PRIVATE(_MEM, WMARK_HIGH_LIMIT),
+		.read_u64 = mem_cgroup_read_u64,
+	},
+	{
+		.name = "wmark_low",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = MEMFILE_PRIVATE(_MEM, WMARK_LOW_LIMIT),
+		.read_u64 = mem_cgroup_read_u64,
+	},
 	{
 		.name = "force_empty",
 		.write = mem_cgroup_force_empty_write,
@@ -4663,6 +4785,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 		goto fail;
 
 	INIT_WORK(&memcg->high_work, high_work_func);
+	INIT_WORK(&memcg->wmark_work, wmark_work_func);
 	memcg->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&memcg->oom_notify);
 	mutex_init(&memcg->thresholds_lock);
@@ -4701,6 +4824,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	if (parent) {
 		memcg->swappiness = mem_cgroup_swappiness(parent);
 		memcg->oom_kill_disable = parent->oom_kill_disable;
+		memcg->wmark_ratio = parent->wmark_ratio;
 	}
 	if (parent && parent->use_hierarchy) {
 		memcg->use_hierarchy = true;
@@ -4724,6 +4848,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		memory_cgrp_subsys.broken_hierarchy = true;
 	}
 
+	setup_memcg_wmark(memcg);
+
 	/* The following stuff does not apply to the root */
 	if (!parent) {
 		root_mem_cgroup = memcg;
@@ -4784,6 +4910,9 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
 
+	page_counter_set_wmark_low(&memcg->memory, PAGE_COUNTER_MAX);
+	page_counter_set_wmark_high(&memcg->memory, PAGE_COUNTER_MAX);
+
 	memcg_offline_kmem(memcg);
 	wb_memcg_offline(memcg);
 
@@ -4809,6 +4938,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 
 	vmpressure_cleanup(&memcg->vmpressure);
 	cancel_work_sync(&memcg->high_work);
+	cancel_work_sync(&memcg->wmark_work);
 	mem_cgroup_remove_from_trees(memcg);
 	memcg_free_shrinker_maps(memcg);
 	memcg_free_kmem(memcg);
@@ -4839,6 +4969,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
+	page_counter_set_wmark_low(&memcg->memory, PAGE_COUNTER_MAX);
+	page_counter_set_wmark_high(&memcg->memory, PAGE_COUNTER_MAX);
 	memcg->high = PAGE_COUNTER_MAX;
 	memcg->soft_limit = PAGE_COUNTER_MAX;
 	memcg_wb_domain_size_changed(memcg);
@@ -6566,6 +6698,13 @@ static int __init mem_cgroup_init(void)
 {
 	int cpu, node;
 
+	memcg_wmark_wq = alloc_workqueue("memcg_wmark", WQ_MEM_RECLAIM |
+					 WQ_UNBOUND | WQ_FREEZABLE,
+					 WQ_UNBOUND_MAX_ACTIVE);
+
+	if (!memcg_wmark_wq)
+		return -ENOMEM;
+
 #ifdef CONFIG_MEMCG_KMEM
 	/*
 	 * Kmem cache creation is mostly done with the slab_mutex held,
diff --git a/mm/page_counter.c b/mm/page_counter.c
index de31470655f66c3492b1858bd308eedd95917afd..ee480e6884a04cdf2178a3e67e21576854984a81 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -236,6 +236,18 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
 		propagate_protected_usage(c, atomic_long_read(&c->usage));
 }
 
+void page_counter_set_wmark_high(struct page_counter *counter,
+				 unsigned long nr_pages)
+{
+	xchg(&counter->wmark_high, nr_pages);
+}
+
+void page_counter_set_wmark_low(struct page_counter *counter,
+				unsigned long nr_pages)
+{
+	xchg(&counter->wmark_low, nr_pages);
+}
+
 /**
  * page_counter_memparse - memparse() for page counter limits
  * @buf:	string to parse
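
For illustration, this is how the new interface is expected to behave,
following the arithmetic in setup_memcg_wmark() above (assuming a 4KiB
page size and the usual legacy hierarchy mount point; the cgroup name
"test" is arbitrary). With a 1G limit and wmark_ratio set to 80,
high_wmark = 262144 * 80 / 100 = 209715 pages and
low_wmark = 209715 - (209715 >> 8) = 208896 pages:

	# mkdir /sys/fs/cgroup/memory/test
	# echo 1G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
	# echo 80 > /sys/fs/cgroup/memory/test/memory.wmark_ratio
	# cat /sys/fs/cgroup/memory/test/memory.wmark_high
	858992640
	# cat /sys/fs/cgroup/memory/test/memory.wmark_low
	855638016

858992640 is 209715 pages * 4096 and 855638016 is 208896 pages * 4096.
Writing 0 back to memory.wmark_ratio disables the water marks again, so
both files report PAGE_COUNTER_MAX scaled to bytes. Since
mem_cgroup_resize_max() also recomputes the water marks, the limit and
the ratio can be configured in either order.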
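
A rough functional check in the spirit of the Test: sections in
memory.txt (the file name and sizes here are arbitrary, and exact usage
figures will vary with page cache behavior): generate enough page cache
in the group to push usage past wmark_high, then confirm that the
memcg_wmark worker pulls usage back down:

	# echo $$ > /sys/fs/cgroup/memory/test/tasks
	# dd if=/dev/zero of=/tmp/bigfile bs=1M count=900
	# cat /sys/fs/cgroup/memory/test/memory.usage_in_bytes

	(Expect usage to settle at or near memory.wmark_low once the
	background worker has run. The water mark path only queues work
	from try_charge(); unlike hitting the hard limit, it never makes
	the charging task reclaim synchronously.)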