Commit e6ca020b authored by Yang Shi, committed by Caspar Zhang

alinux: mm: thp: move deferred split queue to memcg's nodeinfo

Commit 87eaceb3faa59b9b4d940ec9554ce251325d83fe ("mm: thp: make
deferred split shrinker memcg aware") made the deferred split queue
per-memcg to solve the memcg premature OOM problem.  But where there
used to be one queue per node, all nodes now end up sharing a single
queue per memcg.  That is not a big deal for memcg limit reclaim, but
it may cause global kswapd to shrink THPs from a different node.
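
For illustration, here is a sketch of the lookup after that commit (it
mirrors the pre-patch code in the diff below; not verbatim kernel code):
the memcg branch returns one queue for the whole memcg, ignoring the
page's node, while the root path is already per-node:

    /* Sketch of the pre-patch, node-blind memcg lookup. */
    static inline struct deferred_split *get_deferred_split_queue(struct page *page)
    {
            struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
            struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));

            if (memcg)
                    /* One queue per memcg: THPs from all nodes are mixed
                     * here, so the shrinker invoked for node A may end up
                     * splitting THPs that actually reside on node B.
                     */
                    return &memcg->deferred_split_queue;
            else
                    /* The root (no memcg) path is already per-node. */
                    return &pgdat->deferred_split_queue;
    }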

And, 0-day testing reported a -19.6% regression in stress-ng's madvise
test [1].  I didn't see that much regression on my test box (24 threads,
48GB memory, 2 nodes).  With the same test (stress-ng --timeout 1
--metrics-brief --sequential 72 --class vm --exclude spawn,exec) I saw
an average -3% regression (each figure is the average of 10 runs, since
the test itself shows up to 15% variation in my testing), and only in
some runs; other runs showed no regression at all.

This might be caused by deferred split queue lock contention.  With
some configurations (e.g. just one root memcg) the lock contention may
be worse than before: with 2 nodes, for example, the two per-node locks
collapse into a single per-memcg lock.

So, move the deferred split queue into memcg's nodeinfo to make it
NUMA aware again.
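
The lookup then becomes per-(memcg, node); again a sketch, mirroring
the diff below, where each pair gets its own queue and split_queue_lock:

    /* Sketch of the post-patch, NUMA-aware lookup. */
    static inline struct deferred_split *get_deferred_split_queue(struct page *page)
    {
            struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
            int nid = page_to_nid(page);

            if (memcg)
                    /* Each (memcg, node) pair has its own queue and lock. */
                    return &memcg->nodeinfo[nid]->deferred_split_queue;
            else
                    return &NODE_DATA(nid)->deferred_split_queue;
    }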

With this change stress-ng's madvise test shows an average 4%
improvement in some runs, and I no longer see any degradation.

[1]: https://lore.kernel.org/lkml/20190930084604.GC17687@shao2-debian/

Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Parent d651fcbb
@@ -145,6 +145,10 @@ struct mem_cgroup_per_node {
 	bool dirty;		/* memcg has too many dirty pages */
 	bool writeback;		/* memcg has too many writeback */
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	struct deferred_split deferred_split_queue;
+#endif
+
 	struct mem_cgroup *memcg;	/* Back pointer, we cannot */
 					/* use container_of	   */
 };
@@ -325,10 +329,6 @@ struct mem_cgroup {
 	struct idle_page_stats idle_stats[KIDLED_STATS_NR_TYPE];
 #endif
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	struct deferred_split deferred_split_queue;
-#endif
-
 	struct mem_cgroup_per_node *nodeinfo[0];
 	/* WARNING: nodeinfo must be the last member here */
 };
......
@@ -496,10 +496,11 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 static inline struct deferred_split *get_deferred_split_queue(struct page *page)
 {
 	struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
-	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+	int nid = page_to_nid(page);
+	struct pglist_data *pgdat = NODE_DATA(nid);
 
 	if (memcg)
-		return &memcg->deferred_split_queue;
+		return &memcg->nodeinfo[nid]->deferred_split_queue;
 	else
 		return &pgdat->deferred_split_queue;
 }
@@ -2850,12 +2851,13 @@ void deferred_split_huge_page(struct page *page)
 static unsigned long deferred_split_count(struct shrinker *shrink,
 		struct shrink_control *sc)
 {
-	struct pglist_data *pgdata = NODE_DATA(sc->nid);
+	int nid = sc->nid;
+	struct pglist_data *pgdata = NODE_DATA(nid);
 	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
 
 #ifdef CONFIG_MEMCG
 	if (sc->memcg)
-		ds_queue = &sc->memcg->deferred_split_queue;
+		ds_queue = &sc->memcg->nodeinfo[nid]->deferred_split_queue;
 #endif
 	return READ_ONCE(ds_queue->split_queue_len);
 }
@@ -2863,7 +2865,8 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
 static unsigned long deferred_split_scan(struct shrinker *shrink,
 		struct shrink_control *sc)
 {
-	struct pglist_data *pgdata = NODE_DATA(sc->nid);
+	int nid = sc->nid;
+	struct pglist_data *pgdata = NODE_DATA(nid);
 	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
 	unsigned long flags;
 	LIST_HEAD(list), *pos, *next;
@@ -2872,7 +2875,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 
 #ifdef CONFIG_MEMCG
 	if (sc->memcg)
-		ds_queue = &sc->memcg->deferred_split_queue;
+		ds_queue = &sc->memcg->nodeinfo[nid]->deferred_split_queue;
 #endif
 
 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
......
@@ -5015,6 +5015,12 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 	pn->on_tree = false;
 	pn->memcg = memcg;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	spin_lock_init(&pn->deferred_split_queue.split_queue_lock);
+	INIT_LIST_HEAD(&pn->deferred_split_queue.split_queue);
+	pn->deferred_split_queue.split_queue_len = 0;
+#endif
+
 	memcg->nodeinfo[node] = pn;
 	return 0;
 }
@@ -5093,11 +5099,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	INIT_LIST_HEAD(&memcg->cgwb_list);
 #endif
 	kidled_memcg_init(memcg);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
-	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
-	memcg->deferred_split_queue.split_queue_len = 0;
-#endif
 	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
 	return memcg;
 fail:
@@ -5434,6 +5435,8 @@ static int mem_cgroup_move_account(struct page *page,
 	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
 	int ret;
 	bool anon;
+	struct deferred_split *ds_queue;
+	int nid = page_to_nid(page);
 
 	VM_BUG_ON(from == to);
 	VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5481,10 +5484,11 @@ static int mem_cgroup_move_account(struct page *page,
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	if (compound && !list_empty(page_deferred_list(page))) {
-		spin_lock(&from->deferred_split_queue.split_queue_lock);
+		ds_queue = &from->nodeinfo[nid]->deferred_split_queue;
+		spin_lock(&ds_queue->split_queue_lock);
 		list_del_init(page_deferred_list(page));
-		from->deferred_split_queue.split_queue_len--;
-		spin_unlock(&from->deferred_split_queue.split_queue_lock);
+		ds_queue->split_queue_len--;
+		spin_unlock(&ds_queue->split_queue_lock);
 	}
 #endif
 	/*
@@ -5498,11 +5502,12 @@ static int mem_cgroup_move_account(struct page *page,
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	if (compound && list_empty(page_deferred_list(page))) {
-		spin_lock(&to->deferred_split_queue.split_queue_lock);
+		ds_queue = &to->nodeinfo[nid]->deferred_split_queue;
+		spin_lock(&ds_queue->split_queue_lock);
 		list_add_tail(page_deferred_list(page),
-			      &to->deferred_split_queue.split_queue);
-		to->deferred_split_queue.split_queue_len++;
-		spin_unlock(&to->deferred_split_queue.split_queue_lock);
+			      &ds_queue->split_queue);
+		ds_queue->split_queue_len++;
+		spin_unlock(&ds_queue->split_queue_lock);
 	}
 #endif
......