提交 8de7ecc6 编写于 作者: S Shakeel Butt 提交者: Linus Torvalds

memcg: reduce memcg tree traversals for stats collection

Currently cgroup-v1's memcg_stat_show traverses the memcg tree ~17 times
to collect the stats while cgroup-v2's memory_stat_show traverses the
memcg tree thrice.  On a large machine, a couple thousand memcgs is very
normal, and if the churn is high and memcgs stick around due to several
reasons, tens of thousands of nodes in the memcg tree can exist.  This patch
has refactored and shared the stat collection code between cgroup-v1 and
cgroup-v2 and has reduced the tree traversal to just one.

I ran a simple benchmark which reads the root_mem_cgroup's stat file
1000 times in the presence of 2500 memcgs on cgroup-v1. The results are:

Without the patch:
$ time ./read-root-stat-1000-times

real    0m1.663s
user    0m0.000s
sys     0m1.660s

With the patch:
$ time ./read-root-stat-1000-times

real    0m0.468s
user    0m0.000s
sys     0m0.467s

Link: http://lkml.kernel.org/r/20180724224635.143944-1-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Bruce Merry <bmerry@ska.ac.za>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
上级 1c4c3b99
...@@ -2899,29 +2899,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, ...@@ -2899,29 +2899,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
return retval; return retval;
} }
static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) struct accumulated_stats {
{ unsigned long stat[MEMCG_NR_STAT];
struct mem_cgroup *iter; unsigned long events[NR_VM_EVENT_ITEMS];
int i; unsigned long lru_pages[NR_LRU_LISTS];
const unsigned int *stats_array;
memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT); const unsigned int *events_array;
int stats_size;
for_each_mem_cgroup_tree(iter, memcg) { int events_size;
for (i = 0; i < MEMCG_NR_STAT; i++) };
stat[i] += memcg_page_state(iter, i);
}
}
static void tree_events(struct mem_cgroup *memcg, unsigned long *events) static void accumulate_memcg_tree(struct mem_cgroup *memcg,
struct accumulated_stats *acc)
{ {
struct mem_cgroup *iter; struct mem_cgroup *mi;
int i; int i;
memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS); for_each_mem_cgroup_tree(mi, memcg) {
for (i = 0; i < acc->stats_size; i++)
acc->stat[i] += memcg_page_state(mi,
acc->stats_array ? acc->stats_array[i] : i);
for_each_mem_cgroup_tree(iter, memcg) { for (i = 0; i < acc->events_size; i++)
for (i = 0; i < NR_VM_EVENT_ITEMS; i++) acc->events[i] += memcg_sum_events(mi,
events[i] += memcg_sum_events(iter, i); acc->events_array ? acc->events_array[i] : i);
for (i = 0; i < NR_LRU_LISTS; i++)
acc->lru_pages[i] +=
mem_cgroup_nr_lru_pages(mi, BIT(i));
} }
} }
...@@ -3332,6 +3337,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) ...@@ -3332,6 +3337,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
unsigned long memory, memsw; unsigned long memory, memsw;
struct mem_cgroup *mi; struct mem_cgroup *mi;
unsigned int i; unsigned int i;
struct accumulated_stats acc;
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
...@@ -3364,32 +3370,27 @@ static int memcg_stat_show(struct seq_file *m, void *v) ...@@ -3364,32 +3370,27 @@ static int memcg_stat_show(struct seq_file *m, void *v)
seq_printf(m, "hierarchical_memsw_limit %llu\n", seq_printf(m, "hierarchical_memsw_limit %llu\n",
(u64)memsw * PAGE_SIZE); (u64)memsw * PAGE_SIZE);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { memset(&acc, 0, sizeof(acc));
unsigned long long val = 0; acc.stats_size = ARRAY_SIZE(memcg1_stats);
acc.stats_array = memcg1_stats;
acc.events_size = ARRAY_SIZE(memcg1_events);
acc.events_array = memcg1_events;
accumulate_memcg_tree(memcg, &acc);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue; continue;
for_each_mem_cgroup_tree(mi, memcg) seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
val += memcg_page_state(mi, memcg1_stats[i]) * (u64)acc.stat[i] * PAGE_SIZE);
PAGE_SIZE;
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
} }
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) { for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
unsigned long long val = 0; seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
(u64)acc.events[i]);
for_each_mem_cgroup_tree(mi, memcg)
val += memcg_sum_events(mi, memcg1_events[i]);
seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val);
}
for (i = 0; i < NR_LRU_LISTS; i++) {
unsigned long long val = 0;
for_each_mem_cgroup_tree(mi, memcg) for (i = 0; i < NR_LRU_LISTS; i++)
val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); (u64)acc.lru_pages[i] * PAGE_SIZE);
}
#ifdef CONFIG_DEBUG_VM #ifdef CONFIG_DEBUG_VM
{ {
...@@ -5486,8 +5487,7 @@ static int memory_events_show(struct seq_file *m, void *v) ...@@ -5486,8 +5487,7 @@ static int memory_events_show(struct seq_file *m, void *v)
static int memory_stat_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v)
{ {
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
unsigned long stat[MEMCG_NR_STAT]; struct accumulated_stats acc;
unsigned long events[NR_VM_EVENT_ITEMS];
int i; int i;
/* /*
...@@ -5501,66 +5501,62 @@ static int memory_stat_show(struct seq_file *m, void *v) ...@@ -5501,66 +5501,62 @@ static int memory_stat_show(struct seq_file *m, void *v)
* Current memory state: * Current memory state:
*/ */
tree_stat(memcg, stat); memset(&acc, 0, sizeof(acc));
tree_events(memcg, events); acc.stats_size = MEMCG_NR_STAT;
acc.events_size = NR_VM_EVENT_ITEMS;
accumulate_memcg_tree(memcg, &acc);
seq_printf(m, "anon %llu\n", seq_printf(m, "anon %llu\n",
(u64)stat[MEMCG_RSS] * PAGE_SIZE); (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
seq_printf(m, "file %llu\n", seq_printf(m, "file %llu\n",
(u64)stat[MEMCG_CACHE] * PAGE_SIZE); (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
seq_printf(m, "kernel_stack %llu\n", seq_printf(m, "kernel_stack %llu\n",
(u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
seq_printf(m, "slab %llu\n", seq_printf(m, "slab %llu\n",
(u64)(stat[NR_SLAB_RECLAIMABLE] + (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
seq_printf(m, "sock %llu\n", seq_printf(m, "sock %llu\n",
(u64)stat[MEMCG_SOCK] * PAGE_SIZE); (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
seq_printf(m, "shmem %llu\n", seq_printf(m, "shmem %llu\n",
(u64)stat[NR_SHMEM] * PAGE_SIZE); (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
seq_printf(m, "file_mapped %llu\n", seq_printf(m, "file_mapped %llu\n",
(u64)stat[NR_FILE_MAPPED] * PAGE_SIZE); (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
seq_printf(m, "file_dirty %llu\n", seq_printf(m, "file_dirty %llu\n",
(u64)stat[NR_FILE_DIRTY] * PAGE_SIZE); (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
seq_printf(m, "file_writeback %llu\n", seq_printf(m, "file_writeback %llu\n",
(u64)stat[NR_WRITEBACK] * PAGE_SIZE); (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
for (i = 0; i < NR_LRU_LISTS; i++) { for (i = 0; i < NR_LRU_LISTS; i++)
struct mem_cgroup *mi; seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
unsigned long val = 0; (u64)acc.lru_pages[i] * PAGE_SIZE);
for_each_mem_cgroup_tree(mi, memcg)
val += mem_cgroup_nr_lru_pages(mi, BIT(i));
seq_printf(m, "%s %llu\n",
mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
}
seq_printf(m, "slab_reclaimable %llu\n", seq_printf(m, "slab_reclaimable %llu\n",
(u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
seq_printf(m, "slab_unreclaimable %llu\n", seq_printf(m, "slab_unreclaimable %llu\n",
(u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
/* Accumulated memory events */ /* Accumulated memory events */
seq_printf(m, "pgfault %lu\n", events[PGFAULT]); seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
seq_printf(m, "pgrefill %lu\n", events[PGREFILL]); seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] + seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
events[PGSCAN_DIRECT]); acc.events[PGSCAN_DIRECT]);
seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] + seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
events[PGSTEAL_DIRECT]); acc.events[PGSTEAL_DIRECT]);
seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]); seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]); seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]); seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]); seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
seq_printf(m, "workingset_refault %lu\n", seq_printf(m, "workingset_refault %lu\n",
stat[WORKINGSET_REFAULT]); acc.stat[WORKINGSET_REFAULT]);
seq_printf(m, "workingset_activate %lu\n", seq_printf(m, "workingset_activate %lu\n",
stat[WORKINGSET_ACTIVATE]); acc.stat[WORKINGSET_ACTIVATE]);
seq_printf(m, "workingset_nodereclaim %lu\n", seq_printf(m, "workingset_nodereclaim %lu\n",
stat[WORKINGSET_NODERECLAIM]); acc.stat[WORKINGSET_NODERECLAIM]);
return 0; return 0;
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册