Commit 6967792f    Author: Yang Shi    Committer: Caspar Zhang

alinux: mm: memcontrol: support background async page reclaim

Currently, when memory usage exceeds the memory cgroup limit, the memory
cgroup can only do synchronous direct reclaim.  This may incur unexpected
stalls in applications that are sensitive to latency.  Introduce a background
async page reclaim mechanism, similar to what kswapd does.

Define the memcg memory usage water mark by introducing the wmark_ratio
interface, which ranges from 0 to 100 and represents a percentage of the max
limit.  wmark_high is calculated as (max * wmark_ratio / 100), and wmark_low is
(wmark_high - (wmark_high >> 8)), which is an empirical value.  If wmark_ratio
is 0, the water mark is disabled; both wmark_low and wmark_high are then
PAGE_COUNTER_MAX (effectively unlimited), which is the default.
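
For example (hypothetical numbers): with max = 262144 pages (1 GiB with 4 KiB
pages) and wmark_ratio = 80, wmark_high = 262144 * 80 / 100 = 209715 pages
(~819 MiB) and wmark_low = 209715 - (209715 >> 8) = 209715 - 819 = 208896
pages (816 MiB), i.e. the low water mark sits roughly 0.4% below the high
water mark.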

If wmark_ratio is set up, then when charging pages, if the usage is greater
than wmark_high, which means the available memory of the memcg is low, a work
item is scheduled to do background page reclaim until memory usage is reduced
to wmark_low if possible.

Define a dedicated unbound workqueue for scheduling water mark reclaim work
items.
Reviewed-by: Gavin Shan <shan.gavin@linux.alibaba.com>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
Parent 49a3b465
@@ -88,6 +88,12 @@ Brief summary of control files.
memory.kmem.tcp.failcnt # show the number of tcp buf memory usage hits limits
memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded
memory.wmark_ratio # water mark ratio
memory.wmark_low # low limit (memory usage low water mark,
read-only)
 memory.wmark_high # high limit (memory usage high water mark,
read-only)
1. History
The memory controller has a long history. A request for comments for the memory
@@ -853,7 +859,20 @@ Test:
(Expect a bunch of notifications, and eventually, the oom-killer will
trigger.)
12. TODO
12. Background reclaim
The user can set up a memory usage water mark by echoing a value to
memory.wmark_ratio. Valid values range from 0 to 100, representing a
percentage of the max limit. wmark_low and wmark_high are calculated from the
max limit and wmark_ratio. 0 means the water mark is disabled; both wmark_low
and wmark_high are then effectively unlimited, which is the default.
Once the water mark is set up, when charging pages to the memcg, if the usage
exceeds wmark_high (which means available memory is low), a work item is
scheduled to reclaim pages in the background, trying to reduce memory usage to
wmark_low if possible.
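A minimal usage sketch (illustrative only; assumes the memory controller is
mounted at /sys/fs/cgroup/memory, a hypothetical group named "foo", and
4 KiB pages):
  # mkdir /sys/fs/cgroup/memory/foo
  # echo 100M > /sys/fs/cgroup/memory/foo/memory.limit_in_bytes
  # echo 80 > /sys/fs/cgroup/memory/foo/memory.wmark_ratio
  # cat /sys/fs/cgroup/memory/foo/memory.wmark_high
  83886080
  # cat /sys/fs/cgroup/memory/foo/memory.wmark_low
  83558400
Here the limit is 25600 pages, so wmark_high = 25600 * 80 / 100 = 20480 pages
(83886080 bytes) and wmark_low = 20480 - (20480 >> 8) = 20400 pages
(83558400 bytes).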
13. TODO
1. Make per-cgroup scanner reclaim not-shared pages first
2. Teach controller to account for shared-pages
......
@@ -288,6 +288,9 @@ struct mem_cgroup {
bool tcpmem_active;
int tcpmem_pressure;
unsigned int wmark_ratio;
struct work_struct wmark_work;
#ifdef CONFIG_MEMCG_KMEM
/* Index in the kmem_cache->memcg_params.memcg_caches array */
int kmemcg_id;
@@ -793,6 +796,14 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
void mem_cgroup_split_huge_fixup(struct page *head);
#endif
static inline bool is_wmark_ok(struct mem_cgroup *memcg, bool high)
{
if (high)
return page_counter_read(&memcg->memory) < memcg->memory.wmark_high;
return page_counter_read(&memcg->memory) < memcg->memory.wmark_low;
}
#else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0
@@ -1109,6 +1120,11 @@ static inline
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
}
static inline bool is_wmark_ok(struct mem_cgroup *memcg, bool low)
{
return false;
}
#endif /* CONFIG_MEMCG */
/* idx can be of type enum memcg_stat_item or node_stat_item */
......
@@ -23,6 +23,10 @@ struct page_counter {
atomic_long_t low_usage;
atomic_long_t children_low_usage;
/* water mark low and high */
unsigned long wmark_low;
unsigned long wmark_high;
/* legacy */
unsigned long watermark;
unsigned long failcnt;
@@ -55,6 +59,10 @@ bool page_counter_try_charge(struct page_counter *counter,
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages);
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
void page_counter_set_wmark_high(struct page_counter *counter,
unsigned long nr_pages);
void page_counter_set_wmark_low(struct page_counter *counter,
unsigned long nr_pages);
int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
int page_counter_memparse(const char *buf, const char *max,
unsigned long *nr_pages);
......
@@ -95,6 +95,8 @@ int do_swap_account __read_mostly;
#define do_swap_account 0
#endif
struct workqueue_struct *memcg_wmark_wq;
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
@@ -2156,6 +2158,34 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
return 0;
}
static void reclaim_wmark(struct mem_cgroup *memcg)
{
long nr_pages;
if (is_wmark_ok(memcg, false))
return;
nr_pages = page_counter_read(&memcg->memory) -
memcg->memory.wmark_low;
if (nr_pages <= 0)
return;
nr_pages = max(SWAP_CLUSTER_MAX, (unsigned long)nr_pages);
try_to_free_mem_cgroup_pages(memcg, nr_pages, GFP_KERNEL, true);
}
static void wmark_work_func(struct work_struct *work)
{
struct mem_cgroup *memcg;
memcg = container_of(work, struct mem_cgroup, wmark_work);
current->flags |= PF_SWAPWRITE | PF_MEMALLOC;
reclaim_wmark(memcg);
current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC);
}
static void reclaim_high(struct mem_cgroup *memcg,
unsigned int nr_pages,
gfp_t gfp_mask)
@@ -2488,6 +2518,11 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
* reclaim, the cost of mismatch is negligible.
*/
do {
if (!is_wmark_ok(memcg, true)) {
queue_work(memcg_wmark_wq, &memcg->wmark_work);
break;
}
if (page_counter_read(&memcg->memory) > memcg->high) {
/* Don't bother a random interrupted task */
if (in_interrupt()) {
@@ -2915,6 +2950,25 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
}
#endif
static void setup_memcg_wmark(struct mem_cgroup *memcg)
{
unsigned long high_wmark;
unsigned long low_wmark;
unsigned long max = memcg->memory.max;
unsigned int wmark_ratio = memcg->wmark_ratio;
if (wmark_ratio) {
high_wmark = (max * wmark_ratio) / 100;
low_wmark = high_wmark - (high_wmark >> 8);
page_counter_set_wmark_low(&memcg->memory, low_wmark);
page_counter_set_wmark_high(&memcg->memory, high_wmark);
} else {
page_counter_set_wmark_low(&memcg->memory, PAGE_COUNTER_MAX);
page_counter_set_wmark_high(&memcg->memory, PAGE_COUNTER_MAX);
}
}
static DEFINE_MUTEX(memcg_max_mutex);
static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
@@ -2965,8 +3019,15 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
}
} while (true);
if (!ret && enlarge)
memcg_oom_recover(memcg);
if (!ret) {
setup_memcg_wmark(memcg);
if (!is_wmark_ok(memcg, true))
queue_work(memcg_wmark_wq, &memcg->wmark_work);
if (enlarge)
memcg_oom_recover(memcg);
}
return ret;
}
@@ -3210,6 +3271,8 @@ enum {
RES_MAX_USAGE,
RES_FAILCNT,
RES_SOFT_LIMIT,
WMARK_HIGH_LIMIT,
WMARK_LOW_LIMIT,
};
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
@@ -3250,6 +3313,10 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
return counter->failcnt;
case RES_SOFT_LIMIT:
return (u64)memcg->soft_limit * PAGE_SIZE;
case WMARK_HIGH_LIMIT:
return (u64)counter->wmark_high * PAGE_SIZE;
case WMARK_LOW_LIMIT:
return (u64)counter->wmark_low * PAGE_SIZE;
default:
BUG();
}
@@ -3695,6 +3762,43 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
return 0;
}
static int memory_wmark_ratio_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
unsigned int wmark_ratio = READ_ONCE(memcg->wmark_ratio);
seq_printf(m, "%d\n", wmark_ratio);
return 0;
}
static ssize_t memory_wmark_ratio_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
int ret, wmark_ratio;
buf = strstrip(buf);
if (!buf)
return -EINVAL;
ret = kstrtouint(buf, 0, &wmark_ratio);
if (ret)
return ret;
if (wmark_ratio > 100)
return -EINVAL;
xchg(&memcg->wmark_ratio, wmark_ratio);
setup_memcg_wmark(memcg);
if (!is_wmark_ok(memcg, true))
queue_work(memcg_wmark_wq, &memcg->wmark_work);
return nbytes;
}
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
struct mem_cgroup_threshold_ary *t;
@@ -4402,6 +4506,24 @@ static struct cftype mem_cgroup_legacy_files[] = {
.name = "stat",
.seq_show = memcg_stat_show,
},
{
.name = "wmark_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_wmark_ratio_show,
.write = memory_wmark_ratio_write,
},
{
.name = "wmark_high",
.flags = CFTYPE_NOT_ON_ROOT,
.private = MEMFILE_PRIVATE(_MEM, WMARK_HIGH_LIMIT),
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "wmark_low",
.flags = CFTYPE_NOT_ON_ROOT,
.private = MEMFILE_PRIVATE(_MEM, WMARK_LOW_LIMIT),
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "force_empty",
.write = mem_cgroup_force_empty_write,
@@ -4663,6 +4785,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
goto fail;
INIT_WORK(&memcg->high_work, high_work_func);
INIT_WORK(&memcg->wmark_work, wmark_work_func);
memcg->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&memcg->oom_notify);
mutex_init(&memcg->thresholds_lock);
@@ -4701,6 +4824,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent) {
memcg->swappiness = mem_cgroup_swappiness(parent);
memcg->oom_kill_disable = parent->oom_kill_disable;
memcg->wmark_ratio = parent->wmark_ratio;
}
if (parent && parent->use_hierarchy) {
memcg->use_hierarchy = true;
@@ -4724,6 +4848,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
memory_cgrp_subsys.broken_hierarchy = true;
}
setup_memcg_wmark(memcg);
/* The following stuff does not apply to the root */
if (!parent) {
root_mem_cgroup = memcg;
@@ -4784,6 +4910,9 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
page_counter_set_wmark_low(&memcg->memory, PAGE_COUNTER_MAX);
page_counter_set_wmark_high(&memcg->memory, PAGE_COUNTER_MAX);
memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
@@ -4809,6 +4938,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
vmpressure_cleanup(&memcg->vmpressure);
cancel_work_sync(&memcg->high_work);
cancel_work_sync(&memcg->wmark_work);
mem_cgroup_remove_from_trees(memcg);
memcg_free_shrinker_maps(memcg);
memcg_free_kmem(memcg);
@@ -4839,6 +4969,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
page_counter_set_wmark_low(&memcg->memory, PAGE_COUNTER_MAX);
page_counter_set_wmark_high(&memcg->memory, PAGE_COUNTER_MAX);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
memcg_wb_domain_size_changed(memcg);
@@ -6566,6 +6698,13 @@ static int __init mem_cgroup_init(void)
{
int cpu, node;
memcg_wmark_wq = alloc_workqueue("memcg_wmark", WQ_MEM_RECLAIM |
WQ_UNBOUND | WQ_FREEZABLE,
WQ_UNBOUND_MAX_ACTIVE);
if (!memcg_wmark_wq)
return -ENOMEM;
#ifdef CONFIG_MEMCG_KMEM
/*
* Kmem cache creation is mostly done with the slab_mutex held,
......
@@ -236,6 +236,18 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
propagate_protected_usage(c, atomic_long_read(&c->usage));
}
void page_counter_set_wmark_high(struct page_counter *counter,
unsigned long nr_pages)
{
xchg(&counter->wmark_high, nr_pages);
}
void page_counter_set_wmark_low(struct page_counter *counter,
unsigned long nr_pages)
{
xchg(&counter->wmark_low, nr_pages);
}
/**
* page_counter_memparse - memparse() for page counter limits
* @buf: string to parse
......