Commit 60be0f54 authored by Xunlei Pang, committed by Shile Zhang

alinux: memcg: Introduce memory.wmark_min_adj

In co-location environments there is more or less memory
overcommitment, so BATCH tasks may break the shared global min
watermark, causing all types of applications to fall into the
direct reclaim slow path and hurting the RT of LS tasks.
(NOTE: BATCH tasks tolerate big latency spikes, even in seconds,
as long as their overall throughput is not hurt. LS tasks, by
contrast, are very Latency-Sensitive and may time out or fail
when a sudden latency spike lasts for, typically, hundreds of ms.)

Actually, BATCH tasks are not sensitive to memory allocation
latency, so they can be assigned a strict min watermark different
from that of LS tasks (which can accordingly be assigned a lenient
min watermark), isolating the two from each other during global
memory allocation. This is similar to the idea behind ALLOC_HARDER
for rt_task(), see gfp_to_alloc_flags().

memory.wmark_min_adj stands for a per-memcg adjustment of the global
WMARK_MIN; it is used to realize the separate min watermarks described
above. Its valid range is [-25, 50]. Specifically:
a negative value is relative to the interval [0, WMARK_MIN],
a positive value is relative to the interval [WMARK_MIN, WMARK_LOW].
For example,
  -25 means "WMARK_MIN + (WMARK_MIN - 0) * (-25%)"
   50 means "WMARK_MIN + (WMARK_LOW - WMARK_MIN) * 50%"
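
As a quick illustration, here is a minimal sketch (not part of this
patch; the helper name and standalone form are made up) of how the two
formulas translate into an adjusted per-memcg min watermark:

        /*
         * Illustrative only: derive the adjusted min watermark from
         * wmark_min_adj according to the two formulas above.
         */
        static unsigned long adjusted_wmark_min(unsigned long wmark_min,
                                                unsigned long wmark_low,
                                                int wmark_min_adj)
        {
                if (wmark_min_adj < 0)
                        /* e.g. -25: WMARK_MIN - 25% of WMARK_MIN */
                        return wmark_min -
                               wmark_min * (-wmark_min_adj) / 100;

                /* e.g. 50: WMARK_MIN + 50% of (WMARK_LOW - WMARK_MIN) */
                return wmark_min +
                       (wmark_low - wmark_min) * wmark_min_adj / 100;
        }

E.g. with WMARK_MIN = 4000 pages and WMARK_LOW = 5000 pages, an
adjustment of -25 yields 3000 pages and 50 yields 4500 pages.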

Note that the minimum of -25 matches what ALLOC_HARDER uses, so it is
safe for us to adopt, while the maximum of 50 is an empirically chosen
value.

A negative memory.wmark_min_adj means high QoS requirements: tasks in
such a memcg may allocate below the global WMARK_MIN, which is again
similar to the idea behind ALLOC_HARDER, see gfp_to_alloc_flags().

A positive memory.wmark_min_adj means low QoS requirements: when an
allocation breaks the memcg min watermark, which would traditionally
trigger direct reclaim, we trigger throttling instead to further
prevent such tasks from disturbing others.
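
To make the scaling concrete (the numbers are made up for
illustration): with the base throttle time of 100ms, a zone whose
adjusted min watermark sits 1000 pages above WMARK_MIN, and free pages
currently 500 pages below that adjusted watermark, the task is
throttled for 500 * 100 / 1000 = 50ms, clamped to [1ms, 1000ms].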

With this interface, we can assign positive values for BATCH memcgs
and negative values for LS memcgs.
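
For example, assuming a cgroup v1 memory controller mounted at
/sys/fs/cgroup/memory (mount point and group names are illustrative):

  echo 50 > /sys/fs/cgroup/memory/batch/memory.wmark_min_adj
  echo -25 > /sys/fs/cgroup/memory/ls/memory.wmark_min_adj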

memory.wmark_min_adj defaults to 0 and is inherited from the parent.
Note that the final effective wmark_min_adj takes all the hierarchical
values into account: it is the maximal (most conservative)
wmark_min_adj along the hierarchy, excluding intermediate default
values (zero).
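
For instance, in the example hierarchy shown in the comment of
memcg_update_wmark_min_adj() below, B (own value -25) under A (own
value -10) gets the effective value -10, while C (left at the
default 0) keeps the effective value 0.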
Reviewed-by: Yang Shi <yang.shi@linux.alibaba.com>
Reviewed-by: Gavin Shan <shan.gavin@linux.alibaba.com>
Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
Parent 63442ea9
@@ -422,6 +422,7 @@ static inline void cgroup_put(struct cgroup *cgrp)
        css_put(&cgrp->self);
}

extern struct mutex cgroup_mutex;
/**
 * task_css_set_check - obtain a task's css_set with extra access conditions
 * @task: the task to obtain css_set for
@@ -436,7 +437,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
 * as locks used during the cgroup_subsys::attach() methods.
 */
#ifdef CONFIG_PROVE_RCU
extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c) \
        rcu_dereference_check((task)->cgroups,                          \
......
@@ -72,6 +72,8 @@ struct mem_cgroup_reclaim_cookie {
        unsigned int generation;
};
struct alloc_context;
#ifdef CONFIG_MEMCG
#define MEM_CGROUP_ID_SHIFT 16
@@ -296,6 +298,9 @@ struct mem_cgroup {
        bool tcpmem_active;
        int tcpmem_pressure;

        int wmark_min_adj;      /* user-set value */
        int wmark_min_eadj;     /* value in effect */

        unsigned int wmark_ratio;
        struct work_struct wmark_work;
        unsigned int wmark_scale_factor;
@@ -550,6 +555,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
}
void mem_cgroup_handle_over_high(void);
void mem_cgroup_wmark_min_throttle(void);
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
@@ -858,6 +864,9 @@ static inline bool is_wmark_ok(struct mem_cgroup *memcg, bool high)
        return page_counter_read(&memcg->memory) < memcg->memory.wmark_low;
}
int memcg_get_wmark_min_adj(struct task_struct *curr);
void memcg_check_wmark_min_adj(struct task_struct *curr,
                               struct alloc_context *ac);
#else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0
@@ -1059,6 +1068,10 @@ static inline void mem_cgroup_handle_over_high(void)
{
}

static inline void mem_cgroup_wmark_min_throttle(void)
{
}

static inline void mem_cgroup_enter_user_fault(void)
{
}
@@ -1179,6 +1192,16 @@ static inline bool is_wmark_ok(struct mem_cgroup *memcg, bool low)
{
        return false;
}

static inline int memcg_get_wmark_min_adj(struct task_struct *curr)
{
        return 0;
}

static inline void memcg_check_wmark_min_adj(struct task_struct *curr,
                                             struct alloc_context *ac)
{
}
#endif /* CONFIG_MEMCG */
/* idx can be of type enum memcg_stat_item or node_stat_item */
......
@@ -1179,6 +1179,7 @@ struct task_struct {
        /* Number of pages to reclaim on returning to userland: */
        unsigned int                    memcg_nr_pages_over_high;

        unsigned int                    wmark_min_throttle_ms;

        /* Used by memcontrol for targeted memcg charge: */
        struct mem_cgroup               *active_memcg;
......
@@ -193,6 +193,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
                task_work_run();

        mem_cgroup_handle_over_high();
        mem_cgroup_wmark_min_throttle();
        blkcg_maybe_throttle_current();
}
......
@@ -34,6 +34,7 @@
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
@@ -56,6 +57,7 @@
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/psi.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
@@ -65,7 +67,6 @@
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -4086,6 +4087,168 @@ static ssize_t memory_wmark_scale_factor_write(struct kernfs_open_file *of,
        return nbytes;
}
/*
 * Figure out the maximal (most conservative) @wmark_min_adj along
 * the hierarchy, excluding intermediate default zeros, as the
 * effective one. Example:
 *
 *                 root
 *                /    \
 *               A      D
 *              / \
 *             B   C
 *                / \
 *               E   F
 *
 * wmark_min_adj:  A -10, B -25, C 0, D 50, E -25, F 50
 * wmark_min_eadj: A -10, B -10, C 0, D 50, E -10, F 50
 */
static void memcg_update_wmark_min_adj(struct mem_cgroup *memcg, int val)
{
        struct mem_cgroup *p;
        struct mem_cgroup *iter;

        mutex_lock(&cgroup_mutex);

        memcg->wmark_min_adj = val;
        /* update hierarchical wmark_min_eadj, pre-order iteration */
        for_each_mem_cgroup_tree(iter, memcg) {
                if (!mem_cgroup_online(iter))
                        continue;

                val = iter->wmark_min_adj;
                p = parent_mem_cgroup(iter);
                if (p && p->wmark_min_eadj && p->wmark_min_eadj > val)
                        val = p->wmark_min_eadj;
                iter->wmark_min_eadj = val;
        }

        mutex_unlock(&cgroup_mutex);
}
static int memory_wmark_min_adj_show(struct seq_file *m, void *v)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));

        /* show the final effective value */
        seq_printf(m, "%d\n", memcg->wmark_min_eadj);

        return 0;
}
static ssize_t memory_wmark_min_adj_write(struct kernfs_open_file *of,
                                char *buf, size_t nbytes, loff_t off)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
        int ret, wmark_min_adj;

        buf = strstrip(buf);
        ret = kstrtoint(buf, 0, &wmark_min_adj);
        if (ret)
                return ret;

        if (wmark_min_adj < -25 || wmark_min_adj > 50)
                return -EINVAL;

        memcg_update_wmark_min_adj(memcg, wmark_min_adj);

        return nbytes;
}
int memcg_get_wmark_min_adj(struct task_struct *curr)
{
        struct mem_cgroup *memcg;
        int val;

        if (mem_cgroup_disabled())
                return 0;

        rcu_read_lock();
        memcg = mem_cgroup_from_css(task_css(curr, memory_cgrp_id));
        if (mem_cgroup_is_root(memcg))
                val = 0;
        else
                val = memcg->wmark_min_eadj;
        rcu_read_unlock();

        return val;
}
/*
 * Scheduled by global page allocation to be executed from the userland
 * return path and throttle when free is under memcg's global WMARK_MIN.
 */
void mem_cgroup_wmark_min_throttle(void)
{
        unsigned int msec = current->wmark_min_throttle_ms;
        unsigned long pflags;

        if (likely(!msec))
                return;

        psi_memstall_enter(&pflags);
        msleep_interruptible(msec);
        psi_memstall_leave(&pflags);

        current->wmark_min_throttle_ms = 0;
}
#define WMARK_MIN_THROTTLE_MS 100UL

/*
 * Tasks in a memcg with a positive memory.wmark_min_adj have their
 * own global min watermark, higher than the global WMARK_MIN:
 * "WMARK_MIN + (WMARK_LOW - WMARK_MIN) * memory.wmark_min_adj"
 *
 * A positive memory.wmark_min_adj means low QoS requirements. When
 * an allocation breaks the memcg min watermark, which would
 * traditionally trigger direct reclaim, we trigger throttling
 * instead to further prevent such tasks from disturbing others.
 *
 * The throttle time is simply linearly proportional to the pages
 * consumed below the memcg's min watermark.
 *
 * The base throttle time is WMARK_MIN_THROTTLE_MS, and the maximal
 * throttle time is ten times WMARK_MIN_THROTTLE_MS.
 *
 * The actual throttling is executed from the userland return
 * path, see mem_cgroup_wmark_min_throttle().
 */
void memcg_check_wmark_min_adj(struct task_struct *curr,
                               struct alloc_context *ac)
{
        struct zoneref *z;
        struct zone *zone;
        unsigned long wmark_min, wmark, min_low_gap, free_pages;
        int wmark_min_adj = memcg_get_wmark_min_adj(curr);

        if (wmark_min_adj <= 0)
                return;

        /* a throttle is already pending for this task */
        if (curr->wmark_min_throttle_ms)
                return;

        z = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask);
        for_next_zone_zonelist_nodemask(zone, z, ac->zonelist,
                                        ac->high_zoneidx, ac->nodemask) {
                if (cpusets_enabled() &&
                    !__cpuset_zone_allowed(zone, __GFP_HARDWALL))
                        continue;

                wmark_min = min_wmark_pages(zone);
                min_low_gap = low_wmark_pages(zone) - wmark_min;
                free_pages = zone_page_state(zone, NR_FREE_PAGES);
                wmark = wmark_min + min_low_gap * wmark_min_adj / 100;
                if (free_pages < wmark && wmark > wmark_min) {
                        unsigned long msec;

                        /*
                         * The throttle time is simply linearly
                         * proportional to the pages consumed below
                         * the memcg's min watermark.
                         */
                        msec = (wmark - free_pages) * WMARK_MIN_THROTTLE_MS /
                               (wmark - wmark_min);
                        msec = clamp(msec, 1UL, 10 * WMARK_MIN_THROTTLE_MS);
                        curr->wmark_min_throttle_ms = msec;
                        set_notify_resume(curr);
                        break;
                }
        }
}
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
        struct mem_cgroup_threshold_ary *t;
@@ -4945,6 +5108,12 @@ static struct cftype mem_cgroup_legacy_files[] = {
                .seq_show = memory_wmark_scale_factor_show,
                .write = memory_wmark_scale_factor_write,
        },
        {
                .name = "wmark_min_adj",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = memory_wmark_min_adj_show,
                .write = memory_wmark_min_adj_write,
        },
        {
                .name = "force_empty",
                .write = mem_cgroup_force_empty_write,
@@ -5319,6 +5488,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        setup_memcg_wmark(memcg);

        if (parent) {
                memcg->wmark_min_adj = parent->wmark_min_adj;
                memcg->wmark_min_eadj = parent->wmark_min_eadj;
        }

        /* The following stuff does not apply to the root */
        if (!parent) {
                root_mem_cgroup = memcg;
......
@@ -3206,6 +3206,14 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
        int o;
        const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));

        /* apply negative memory.wmark_min_adj */
        if ((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) {
                int min_adj = memcg_get_wmark_min_adj(current);

                if (min_adj < 0)
                        min -= mark * (-min_adj) / 100;
        }

        /* free_pages may go negative - that's OK */
        free_pages -= (1 << order) - 1;
@@ -3232,6 +3240,12 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                        min -= min / 4;
        }

        /*
         * Only happens due to memory.wmark_min_adj.
         * Guarantee safe min after memory.wmark_min_adj?
         */
        if (min < mark / 4)
                min = mark / 4;

#ifdef CONFIG_CMA
        /* If allocation can't use CMA areas don't use free CMA pages */
@@ -4387,6 +4401,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        warn_alloc(gfp_mask, ac->nodemask,
                        "page allocation failure: order:%u", order);
got_pg:
        if (ac->migratetype == MIGRATE_MOVABLE)
                memcg_check_wmark_min_adj(current, ac);

        return page;
}
......