Commit 500d0e5e authored by Yu Zhao, committed by YuLinjia

mm: multi-gen LRU: kill switch

mainline inclusion
from mainline-v6.1-rc1
commit 354ed597
category: feature
bugzilla: https://gitee.com/openeuler/open-source-summer/issues/I55Z0L
CVE: NA
Reference: https://android-review.googlesource.com/c/kernel/common/+/2050915/10

----------------------------------------------------------------------

Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that
can be disabled include:
  0x0001: the multi-gen LRU core
  0x0002: walking page tables, when arch_has_hw_pte_young() returns
          true
  0x0004: clearing the accessed bit in non-leaf PMD entries, when
          CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
  [yYnN]: apply to all the components above
E.g.,
  echo y >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0007
  echo 5 >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0005

NB: the page table walks happen on the order of seconds under heavy
memory pressure, in which case mmap_lock contention is a lesser
concern compared with LRU lock contention and I/O congestion. So far
the only well-known case of mmap_lock contention happens on Android,
due to Scudo [1], which allocates several thousand VMAs for merely a
few hundred MB. The SPF and Maple Tree patchsets have also provided
their own assessments [2][3]. However, if walking page tables does
worsen mmap_lock contention, the kill switch can be used to disable
it; in that case the multi-gen LRU suffers only a minor performance
degradation, as shown previously.
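
For example, writing the mask that leaves out bit 0x0002 (the same
value as in the "echo 5" example above) keeps the other components but
turns off the page table walks:
  echo 0x0005 >/sys/kernel/mm/lru_gen/enabled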

Clearing the accessed bit in non-leaf PMD entries can also be
disabled, since this behavior was not tested on x86 varieties other
than Intel and AMD.
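
Similarly, to keep the core and the page table walks but skip the
non-leaf PMD handling, clear only bit 0x0004 (a minimal illustration
derived from the bit definitions above; the value reported back
depends on what the architecture actually supports):
  echo 0x0003 >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0003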

[1] https://source.android.com/devices/tech/debug/scudo
[2] https://lore.kernel.org/lkml/20220128131006.67712-1-michel@lespinasse.org/
[3] https://lore.kernel.org/lkml/20220202024137.2516438-1-Liam.Howlett@oracle.com/

Link: https://lore.kernel.org/r/20220309021230.721028-11-yuzhao@google.com/
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Bug: 227651406
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
Change-Id: I71801d9470a2588cad8bfd14fbcfafc7b010aa03
Signed-off-by: YuLinjia <3110442349@qq.com>
Parent 46a1f8f1
@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
         css_put(&cgrp->self);
 }
 
+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+        mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+        mutex_unlock(&cgroup_mutex);
+}
+
 /**
  * task_css_set_check - obtain a task's css_set with extra access conditions
  * @task: the task to obtain css_set for
@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
  * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
 extern spinlock_t css_set_lock;
 #define task_css_set_check(task, __c)                                  \
         rcu_dereference_check((task)->cgroups,                         \
@@ -704,6 +715,8 @@ struct cgroup;
 static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
 static inline void css_get(struct cgroup_subsys_state *css) {}
 static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
                                          struct task_struct *t) { return 0; }
 static inline int cgroupstats_build(struct cgroupstats *stats,
......
@@ -112,7 +112,15 @@ static __always_inline enum lru_list page_lru(struct page *page)
 
 static inline bool lru_gen_enabled(void)
 {
-        return true;
+#ifdef CONFIG_LRU_GEN_ENABLED
+        DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+        return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
+#else
+        DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+        return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
+#endif
 }
 
 static inline bool lru_gen_in_fault(void)
@@ -202,7 +210,7 @@ static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bo
         int zone = page_zonenum(page);
         struct lru_gen_struct *lrugen = &lruvec->lrugen;
 
-        if (PageUnevictable(page))
+        if (PageUnevictable(page) || !lrugen->enabled)
                 return false;
         /*
          * There are three common cases for this page:
......
@@ -346,6 +346,13 @@ enum {
         LRU_GEN_FILE,
 };
 
+enum {
+        LRU_GEN_CORE,
+        LRU_GEN_MM_WALK,
+        LRU_GEN_NONLEAF_YOUNG,
+        NR_LRU_GEN_CAPS
+};
+
 #define MIN_LRU_BATCH           BITS_PER_LONG
 #define MAX_LRU_BATCH           (MIN_LRU_BATCH * 128)
 
@@ -384,6 +391,8 @@ struct lru_gen_struct {
         /* can be modified without holding the LRU lock */
         atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
         atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+        /* whether the multi-gen LRU is enabled */
+        bool enabled;
 };
 
 enum {
......
@@ -165,7 +165,6 @@ struct cgroup_mgctx {
 #define DEFINE_CGROUP_MGCTX(name)                                       \
         struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
 
-extern struct mutex cgroup_mutex;
 extern spinlock_t css_set_lock;
 extern struct cgroup_subsys *cgroup_subsys[];
 extern struct list_head cgroup_roots;
......
@@ -994,6 +994,12 @@ config LRU_GEN
         help
           A high performance LRU implementation to overcommit memory.
 
+config LRU_GEN_ENABLED
+        bool "Enable by default"
+        depends on LRU_GEN
+        help
+          This option enables the multi-gen LRU by default.
+
 config LRU_GEN_STATS
         bool "Full stats for debugging"
         depends on LRU_GEN
......
@@ -54,6 +54,7 @@
 #include <linux/psi.h>
 #include <linux/pagewalk.h>
 #include <linux/shmem_fs.h>
+#include <linux/ctype.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -2763,6 +2764,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 
 #ifdef CONFIG_LRU_GEN
 
+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#else
+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#endif
+
 /******************************************************************************
  *                          shorthand helpers
  ******************************************************************************/
@@ -2799,6 +2806,15 @@ static int page_lru_tier(struct page *page)
         return lru_tier_from_refs(refs);
 }
 
+static bool get_cap(int cap)
+{
+#ifdef CONFIG_LRU_GEN_ENABLED
+        return static_branch_likely(&lru_gen_caps[cap]);
+#else
+        return static_branch_unlikely(&lru_gen_caps[cap]);
+#endif
+}
+
 static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
 {
         struct pglist_data *pgdat = NODE_DATA(nid);
@@ -3604,7 +3620,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
                         goto next;
 
                 if (!pmd_trans_huge(pmd[i])) {
-                        if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
+                        if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+                            get_cap(LRU_GEN_NONLEAF_YOUNG))
                                 pmdp_test_and_clear_young(vma, addr, pmd + i);
                         goto next;
                 }
@@ -3711,10 +3728,12 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
                 priv->mm_stats[MM_PMD_TOTAL]++;
 
 #ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
-                if (!pmd_young(val))
-                        continue;
+                if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+                        if (!pmd_young(val))
+                                continue;
 
-                walk_pmd_range_locked(pud, addr, vma, walk, &pos);
+                        walk_pmd_range_locked(pud, addr, vma, walk, &pos);
+                }
 #endif
                 if (!priv->full_scan && !test_bloom_filter(priv->lruvec, priv->max_seq, pmd + i))
                         continue;
@@ -3955,7 +3974,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
          * handful of PTEs. Spreading the work out over a period of time usually
          * is less efficient, but it avoids bursty page faults.
          */
-        if (!full_scan && !arch_has_hw_pte_young()) {
+        if (!full_scan && (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK))) {
                 success = iterate_mm_list_nowalk(lruvec, max_seq);
                 goto done;
         }
@@ -4680,6 +4699,223 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
         blk_finish_plug(&plug);
 }
 
+/******************************************************************************
+ *                          state change
+ ******************************************************************************/
+
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+        struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+        if (lrugen->enabled) {
+                enum lru_list lru;
+
+                for_each_evictable_lru(lru) {
+                        if (!list_empty(&lruvec->lists[lru]))
+                                return false;
+                }
+        } else {
+                int gen, type, zone;
+
+                for_each_gen_type_zone(gen, type, zone) {
+                        if (!list_empty(&lrugen->lists[gen][type][zone]))
+                                return false;
+
+                        /* unlikely but not a bug when reset_batch_size() is pending */
+                        VM_WARN_ON(lrugen->nr_pages[gen][type][zone]);
+                }
+        }
+
+        return true;
+}
+
+static bool fill_evictable(struct lruvec *lruvec)
+{
+        enum lru_list lru;
+        int remaining = MAX_LRU_BATCH;
+
+        for_each_evictable_lru(lru) {
+                int type = is_file_lru(lru);
+                bool active = is_active_lru(lru);
+                struct list_head *head = &lruvec->lists[lru];
+
+                while (!list_empty(head)) {
+                        bool success;
+                        struct page *page = lru_to_page(head);
+
+                        VM_BUG_ON_PAGE(PageTail(page), page);
+                        VM_BUG_ON_PAGE(PageUnevictable(page), page);
+                        VM_BUG_ON_PAGE(PageActive(page) != active, page);
+                        VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
+                        VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page);
+
+                        prefetchw_prev_lru_page(page, head, flags);
+
+                        del_page_from_lru_list(page, lruvec, page_lru(page));
+                        success = lru_gen_add_page(lruvec, page, false);
+                        VM_BUG_ON(!success);
+
+                        if (!--remaining)
+                                return false;
+                }
+        }
+
+        return true;
+}
+
+static bool drain_evictable(struct lruvec *lruvec)
+{
+        int gen, type, zone;
+        int remaining = MAX_LRU_BATCH;
+
+        for_each_gen_type_zone(gen, type, zone) {
+                struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
+
+                while (!list_empty(head)) {
+                        bool success;
+                        struct page *page = lru_to_page(head);
+
+                        VM_BUG_ON_PAGE(PageTail(page), page);
+                        VM_BUG_ON_PAGE(PageUnevictable(page), page);
+                        VM_BUG_ON_PAGE(PageActive(page), page);
+                        VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
+                        VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
+
+                        prefetchw_prev_lru_page(page, head, flags);
+
+                        success = lru_gen_del_page(lruvec, page, false);
+                        VM_BUG_ON(!success);
+                        add_page_to_lru_list(page, lruvec, page_lru(page));
+
+                        if (!--remaining)
+                                return false;
+                }
+        }
+
+        return true;
+}
+
+static void lru_gen_change_state(bool enable)
+{
+        static DEFINE_MUTEX(state_mutex);
+
+        struct mem_cgroup *memcg;
+
+        cgroup_lock();
+        cpus_read_lock();
+        get_online_mems();
+        mutex_lock(&state_mutex);
+
+        if (enable == lru_gen_enabled())
+                goto unlock;
+
+        if (enable)
+                static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+        else
+                static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+
+        memcg = mem_cgroup_iter(NULL, NULL, NULL);
+        do {
+                int nid;
+
+                for_each_node(nid) {
+                        struct pglist_data *pgdat = NODE_DATA(nid);
+                        struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+                        if (!lruvec)
+                                continue;
+
+                        if (!pgdat) {
+                                lruvec->lrugen.enabled = enable;
+                                continue;
+                        }
+
+                        spin_lock_irq(&lruvec->lru_lock);
+
+                        VM_BUG_ON(!seq_is_valid(lruvec));
+                        VM_BUG_ON(!state_is_valid(lruvec));
+
+                        lruvec->lrugen.enabled = enable;
+
+                        while (!(enable ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
+                                spin_unlock_irq(&lruvec->lru_lock);
+                                cond_resched();
+                                spin_lock_irq(&lruvec->lru_lock);
+                        }
+
+                        spin_unlock_irq(&lruvec->lru_lock);
+                }
+
+                cond_resched();
+        } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+        mutex_unlock(&state_mutex);
+        put_online_mems();
+        cpus_read_unlock();
+        cgroup_unlock();
+}
+
+/******************************************************************************
+ *                          sysfs interface
+ ******************************************************************************/
+
+static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+        unsigned int caps = 0;
+
+        if (get_cap(LRU_GEN_CORE))
+                caps |= BIT(LRU_GEN_CORE);
+
+        if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
+                caps |= BIT(LRU_GEN_MM_WALK);
+
+        if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+                caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
+
+        return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
+}
+
+static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr,
+                            const char *buf, size_t len)
+{
+        int i;
+        unsigned int caps;
+
+        if (tolower(*buf) == 'n')
+                caps = 0;
+        else if (tolower(*buf) == 'y')
+                caps = -1;
+        else if (kstrtouint(buf, 0, &caps))
+                return -EINVAL;
+
+        for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
+                bool enable = caps & BIT(i);
+
+                if (i == LRU_GEN_CORE)
+                        lru_gen_change_state(enable);
+                else if (enable)
+                        static_branch_enable(&lru_gen_caps[i]);
+                else
+                        static_branch_disable(&lru_gen_caps[i]);
+        }
+
+        return len;
+}
+
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
+        enabled, 0644, show_enable, store_enable
+);
+
+static struct attribute *lru_gen_attrs[] = {
+        &lru_gen_enabled_attr.attr,
+        NULL
+};
+
+static struct attribute_group lru_gen_attr_group = {
+        .name = "lru_gen",
+        .attrs = lru_gen_attrs,
+};
+
 /******************************************************************************
  *                          initialization
  ******************************************************************************/
@@ -4690,6 +4926,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
         struct lru_gen_struct *lrugen = &lruvec->lrugen;
 
         lrugen->max_seq = MIN_NR_GENS + 1;
+        lrugen->enabled = lru_gen_enabled();
 
         for_each_gen_type_zone(gen, type, zone)
                 INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
@@ -4730,6 +4967,9 @@ static int __init init_lru_gen(void)
         BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
         BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
 
+        if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+                pr_err("lru_gen: failed to create sysfs group\n");
+
         return 0;
 };
 
 late_initcall(init_lru_gen);
......