Unverified commit 7602779e, authored by openeuler-ci-bot and committed by Gitee

!1228 Per-memcg swap control

Merge Pull Request from: @ci-robot 
 
PR sync from: Liu Shixin <liushixin2@huawei.com>
https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/LB3KEGYTHZN2JVMAZADIFG73PYZUD2RV/ 
This patch series adds swap control for memory cgroups. Patch [2] adds a page
type key to the memory.reclaim interface to support reclaiming anon pages.
Patch [4] adds the memory.force_swapin interface to proactively swap pages
back in. Patch [5] adds the memory.swap.max interface to limit a memory
cgroup's swap usage. Patches [6-7] add the memory.swapfile interface to
restrict which swap devices a memory cgroup may use.
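
A rough usage sketch of the new knobs (not part of the patch itself; it assumes
the cgroup v1 memory controller is mounted at /sys/fs/cgroup/memory, <cg> is a
placeholder cgroup, and /dev/zram0 stands in for an already-active swap device):

    # reclaim up to 1G from the cgroup, anonymous pages only (patch [2])
    echo "1G type=anon" > /sys/fs/cgroup/memory/<cg>/memory.reclaim
    # proactively swap the cgroup's anonymous pages back in (patch [4])
    echo 0 > /sys/fs/cgroup/memory/<cg>/memory.force_swapin
    # cap the cgroup's swap usage at 512M (patch [5])
    echo 512M > /sys/fs/cgroup/memory/<cg>/memory.swap.max
    # restrict the cgroup to one swap device; "all" and "none" also accepted (patches [6-7])
    echo /dev/zram0 > /sys/fs/cgroup/memory/<cg>/memory.swapfile

Note that memory.swap.max and memory.swapfile return -EACCES until the feature
is enabled at runtime through the vm.memcg_swap_qos_enable sysctl introduced
by this series.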

v2->v3: Enable memcg swap qos for x86_64 and arm64 by default.
v1->v2: Rebase on the latest version and fix merge conflicts.

Liu Shixin (7):
  memcg: add page type to memory.reclaim interface
  memcg: introduce memcg swap qos feature
  memcg: introduce per-memcg swapin interface
  memcg: add restrict to swap to cgroup1
  mm/swapfile: introduce per-memcg swapfile control
  mm: swap_slots: add per-type slot cache
  config: enable memcg swap qos for x86_64 and arm64 by default

Yosry Ahmed (1):
  mm: vmpressure: don't count proactive reclaim in vmpressure


-- 
2.25.1
 
 
Link:https://gitee.com/openeuler/kernel/pulls/1228 

Reviewed-by: Jialin Zhang <zhangjialin11@huawei.com> 
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com> 
......@@ -78,6 +78,9 @@ Brief summary of control files.
memory.stat                  show various statistics
memory.use_hierarchy         set/show hierarchical account enabled
memory.force_empty           trigger forced page reclaim
memory.force_swapin          trigger forced swapin of anon pages
memory.swap.max              set/show limit for swap
memory.swapfile              set/show available swap file
memory.pressure_level        set memory pressure notifications
memory.swappiness            set/show swappiness parameter of vmscan
                             (See sysctl's vm.swappiness)
......
......@@ -1196,20 +1196,28 @@ PAGE_SIZE multiple when read back.
target cgroup.

This file accepts a single key, the number of bytes to reclaim.
No nested keys are currently supported.

Example::

    echo "1G" > memory.reclaim

The interface can be later extended with nested keys to
configure the reclaim behavior. For example, specify the
type of memory to reclaim from (anon, file, ..).

This file also accepts nested keys, the number of bytes to reclaim
with the type of memory to reclaim.

Example::

    echo "1G type=file" > memory.reclaim

Please note that the kernel can over or under reclaim from
the target cgroup. If less bytes are reclaimed than the
specified amount, -EAGAIN is returned.

Please note that the proactive reclaim (triggered by this
interface) is not meant to indicate memory pressure on the
memory cgroup. Therefore socket memory balancing triggered by
the memory reclaim normally is not exercised in this case.
This means that the networking layer will not adapt based on
reclaim induced by memory.reclaim.

memory.oom.group
    A read-write single value file which exists on non-root
    cgroups. The default value is "0".
......
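As a complement to the type=file example above (a sketch, not text from the
patch): the same nested-key syntax requests anon-only reclaim, and the
memory_reclaim() hunk further down returns -EAGAIN right away when the cgroup
has no available swap space of the permitted type::

    echo "1G type=anon" > memory.reclaim
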
......@@ -1092,6 +1092,7 @@ CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y
CONFIG_CLEANCACHE=y
CONFIG_FRONTSWAP=y
CONFIG_MEMCG_QOS=y
CONFIG_MEMCG_SWAP_QOS=y
CONFIG_ETMEM_SCAN=m
CONFIG_ETMEM_SWAP=m
CONFIG_ETMEM=y
......
......@@ -1042,6 +1042,7 @@ CONFIG_THP_SWAP=y
CONFIG_CLEANCACHE=y
CONFIG_FRONTSWAP=y
CONFIG_MEMCG_QOS=y
CONFIG_MEMCG_SWAP_QOS=y
CONFIG_ETMEM_SCAN=m
CONFIG_ETMEM_SWAP=m
CONFIG_ETMEM=y
......
......@@ -50,6 +50,11 @@ enum memcg_memory_event {
MEMCG_NR_MEMORY_EVENTS,
};
enum {
SWAP_TYPE_ALL = -1, /* allowed to use all swap files */
SWAP_TYPE_NONE = -2, /* prohibited from using any swapfile */
};
struct mem_cgroup_reclaim_cookie {
pg_data_t *pgdat;
unsigned int generation;
......@@ -240,6 +245,11 @@ struct obj_cgroup {
};
};
struct swap_device {
unsigned long max;
int type;
};
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
......@@ -402,7 +412,12 @@ struct mem_cgroup {
#else
KABI_RESERVE(6)
#endif
#ifdef CONFIG_MEMCG_SWAP_QOS
/* per-memcg swap device control; protected by swap_lock */
KABI_USE(7, struct swap_device *swap_dev)
#else
KABI_RESERVE(7)
#endif
KABI_RESERVE(8)
struct mem_cgroup_per_node *nodeinfo[0];
......@@ -424,6 +439,10 @@ extern int sysctl_memcg_qos_handler(struct ctl_table *table,
void memcg_print_bad_task(struct oom_control *oc);
#endif
#ifdef CONFIG_MEMCG_SWAP_QOS
DECLARE_STATIC_KEY_FALSE(memcg_swap_qos_key);
#endif
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
......@@ -1294,6 +1313,9 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
int mem_cgroup_force_empty(struct mem_cgroup *memcg);
int memcg_get_swap_type(struct page *page);
void memcg_remove_swapfile(int type);
#else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0
......@@ -1701,6 +1723,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
static inline void memcg_print_bad_task(struct oom_control *oc)
{
}
static inline int memcg_get_swap_type(struct page *page)
{
return SWAP_TYPE_ALL;
}
static inline void memcg_remove_swapfile(int type)
{
}
#endif /* CONFIG_MEMCG */
/* idx can be of type enum memcg_stat_item or node_stat_item */
......
......@@ -2650,6 +2650,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf, bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
extern void force_swapin_vma(struct vm_area_struct *vma);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
extern unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file,
......
......@@ -376,10 +376,14 @@ extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
extern int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode);
#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
#define MEMCG_RECLAIM_NOT_FILE (1 << 3)
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
bool may_swap);
unsigned int reclaim_options);
extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
pg_data_t *pgdat,
......@@ -507,11 +511,14 @@ static inline long get_nr_swap_pages(void)
return atomic_long_read(&nr_swap_pages);
}
extern long get_nr_swap_pages_type(int type);
extern void si_swapinfo(struct sysinfo *);
extern swp_entry_t get_swap_page(struct page *page);
extern void put_swap_page(struct page *page, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size,
int type);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t);
extern int swap_duplicate(swp_entry_t);
......@@ -543,6 +550,12 @@ static inline void put_swap_device(struct swap_info_struct *si)
percpu_ref_put(&si->sei->users);
}
#ifdef CONFIG_MEMCG_SWAP_QOS
extern int write_swapfile_for_memcg(struct address_space *mapping,
int *swap_type);
extern void read_swapfile_for_memcg(struct seq_file *m, int type);
void enable_swap_slots_cache_max(void);
#endif
#else /* CONFIG_SWAP */
static inline int swap_readpage(struct page *page, bool do_poll)
......
......@@ -23,7 +23,7 @@ struct swap_slots_cache {
void disable_swap_slots_cache_lock(void);
void reenable_swap_slots_cache_unlock(void);
void enable_swap_slots_cache(void);
void enable_swap_slots_cache(int type);
int free_swap_slot(swp_entry_t entry);
extern bool swap_slot_cache_enabled;
......
......@@ -512,6 +512,15 @@ config MEMCG_QOS
If unsure, say "n".
config MEMCG_SWAP_QOS
bool "Enable Memory Cgroup Swap Control"
depends on MEMCG_SWAP
depends on X86 || ARM64
default n
help
Memcg swap control includes memory force swapin, swapfile control
and swap limit.
config ETMEM_SCAN
tristate "module: etmem page scan for etmem support"
depends on ETMEM
......
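The option defaults to "n" in Kconfig but is set to "y" in the openEuler
x86_64 and arm64 defconfigs changed earlier in this diff. Even when built in,
the feature stays off until the runtime sysctl registered later in the series
is flipped; a rough enabling sketch (sysctl path derived from the "vm" table
registration below):

    # build time (already =y in the updated openEuler configs)
    CONFIG_MEMCG_SWAP_QOS=y
    # runtime: resets every memcg's swap.max/swapfile to the defaults, then
    # enables the memcg_swap_qos_key static branch
    echo 1 > /proc/sys/vm/memcg_swap_qos_enable    # i.e. sysctl vm.memcg_swap_qos_enable=1
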
......@@ -259,6 +259,25 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
lru_add_drain(); /* Push any new pages onto the LRU now */
}
void force_swapin_vma(struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
if (!can_madv_lru_vma(vma))
return;
if (!file) {
walk_page_vma(vma, &swapin_walk_ops, vma);
lru_add_drain();
} else if (shmem_mapping(file->f_mapping))
force_shm_swapin_readahead(vma, vma->vm_start,
vma->vm_end, file->f_mapping);
}
#else
void force_swapin_vma(struct vm_area_struct *vma)
{
}
#endif /* CONFIG_SWAP */
/*
......
......@@ -2397,7 +2397,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
gfp_mask, true);
gfp_mask,
MEMCG_RECLAIM_MAY_SWAP);
psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
......@@ -2660,7 +2661,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
enum oom_status oom_status;
unsigned long nr_reclaimed;
bool passed_oom = false;
bool may_swap = true;
unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
bool drained = false;
unsigned long pflags;
......@@ -2679,7 +2680,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
mem_over_limit = mem_cgroup_from_counter(counter, memory);
} else {
mem_over_limit = mem_cgroup_from_counter(counter, memsw);
may_swap = false;
reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
}
if (batch > nr_pages) {
......@@ -2715,7 +2716,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, may_swap);
gfp_mask, reclaim_options);
psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
......@@ -3365,8 +3366,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
continue;
}
if (!try_to_free_mem_cgroup_pages(memcg, 1,
GFP_KERNEL, !memsw)) {
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
ret = -EBUSY;
break;
}
......@@ -3483,7 +3484,7 @@ int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return -EINTR;
progress = try_to_free_mem_cgroup_pages(memcg, 1,
GFP_KERNEL, true);
GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
if (!progress) {
nr_retries--;
/* maybe some writeback is necessary */
......@@ -4054,6 +4055,344 @@ void memcg_print_bad_task(struct oom_control *oc)
#endif
#ifdef CONFIG_MEMCG_SWAP_QOS
DEFINE_STATIC_KEY_FALSE(memcg_swap_qos_key);
#ifdef CONFIG_SYSCTL
static int sysctl_memcg_swap_qos_stat;
static void memcg_swap_qos_reset(void)
{
struct mem_cgroup *memcg;
for_each_mem_cgroup(memcg) {
WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX);
WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL);
}
}
static int sysctl_memcg_swap_qos_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
int ret;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (ret)
return ret;
if (write) {
if (sysctl_memcg_swap_qos_stat) {
memcg_swap_qos_reset();
static_branch_enable(&memcg_swap_qos_key);
enable_swap_slots_cache_max();
} else {
static_branch_disable(&memcg_swap_qos_key);
}
}
return 0;
}
static struct ctl_table memcg_swap_qos_sysctls[] = {
{
.procname = "memcg_swap_qos_enable",
.data = &sysctl_memcg_swap_qos_stat,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sysctl_memcg_swap_qos_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{ }
};
static __init int memcg_swap_qos_sysctls_init(void)
{
if (mem_cgroup_disabled() || cgroup_memory_noswap)
return 0;
register_sysctl_init("vm", memcg_swap_qos_sysctls);
return 0;
}
late_initcall(memcg_swap_qos_sysctls_init);
#endif
static int mem_cgroup_task_swapin(struct task_struct *task, void *arg)
{
struct mm_struct *mm = task->mm;
struct vm_area_struct *vma;
struct blk_plug plug;
mmap_read_lock(mm);
blk_start_plug(&plug);
for (vma = mm->mmap; vma; vma = vma->vm_next)
force_swapin_vma(vma);
blk_finish_plug(&plug);
mmap_read_unlock(mm);
return 0;
}
static ssize_t memory_swapin(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
mem_cgroup_scan_tasks(memcg, mem_cgroup_task_swapin, NULL);
return nbytes;
}
static int memcg_alloc_swap_device(struct mem_cgroup *memcg)
{
memcg->swap_dev = kmalloc(sizeof(struct swap_device), GFP_KERNEL);
if (!memcg->swap_dev)
return -ENOMEM;
return 0;
}
static void memcg_free_swap_device(struct mem_cgroup *memcg)
{
if (!memcg->swap_dev)
return;
kfree(memcg->swap_dev);
memcg->swap_dev = NULL;
}
static void memcg_swap_device_init(struct mem_cgroup *memcg,
struct mem_cgroup *parent)
{
if (!static_branch_likely(&memcg_swap_qos_key) || !parent) {
WRITE_ONCE(memcg->swap_dev->max, PAGE_COUNTER_MAX);
WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL);
} else {
WRITE_ONCE(memcg->swap_dev->max,
READ_ONCE(parent->swap_dev->max));
WRITE_ONCE(memcg->swap_dev->type,
READ_ONCE(parent->swap_dev->type));
}
}
u64 memcg_swapmax_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
if (!static_branch_likely(&memcg_swap_qos_key))
return PAGE_COUNTER_MAX * PAGE_SIZE;
return READ_ONCE(memcg->swap_dev->max) * PAGE_SIZE;
}
static ssize_t memcg_swapmax_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned long max;
int err;
if (!static_branch_likely(&memcg_swap_qos_key))
return -EACCES;
buf = strstrip(buf);
err = page_counter_memparse(buf, "max", &max);
if (err)
return err;
WRITE_ONCE(memcg->swap_dev->max, max);
return nbytes;
}
static int mem_cgroup_check_swap_for_v1(struct page *page, swp_entry_t entry)
{
struct mem_cgroup *memcg, *target_memcg;
unsigned long swap_usage;
unsigned long swap_limit;
long nr_swap_pages = PAGE_COUNTER_MAX;
if (!static_branch_likely(&memcg_swap_qos_key))
return 0;
if (!entry.val)
return 0;
rcu_read_lock();
target_memcg = page_memcg(page);
if (!target_memcg || mem_cgroup_is_root(target_memcg)) {
rcu_read_unlock();
return 0;
}
if (!css_tryget_online(&target_memcg->css)) {
rcu_read_unlock();
return 0;
}
rcu_read_unlock();
for (memcg = target_memcg; memcg != root_mem_cgroup;
memcg = parent_mem_cgroup(memcg)) {
swap_limit = READ_ONCE(memcg->swap_dev->max);
swap_usage = page_counter_read(&memcg->memsw) -
page_counter_read(&memcg->memory);
nr_swap_pages = min_t(long, nr_swap_pages,
swap_limit - swap_usage);
}
css_put(&target_memcg->css);
if (thp_nr_pages(page) > nr_swap_pages)
return -ENOMEM;
return 0;
}
static int memcg_swapfile_read(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
int type;
if (!static_branch_likely(&memcg_swap_qos_key)) {
seq_printf(m, "all\n");
return 0;
}
type = READ_ONCE(memcg->swap_dev->type);
if (type == SWAP_TYPE_NONE)
seq_printf(m, "none\n");
else if (type == SWAP_TYPE_ALL)
seq_printf(m, "all\n");
else
read_swapfile_for_memcg(m, type);
return 0;
}
static ssize_t memcg_swapfile_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
struct filename *pathname;
struct file *swapfile;
int ret;
if (!static_branch_likely(&memcg_swap_qos_key))
return -EACCES;
buf = strstrip(buf);
if (!strcmp(buf, "none")) {
WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_NONE);
return nbytes;
} else if (!strcmp(buf, "all")) {
WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_ALL);
return nbytes;
}
pathname = getname_kernel(buf);
if (IS_ERR(pathname))
return PTR_ERR(pathname);
swapfile = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
if (IS_ERR(swapfile)) {
putname(pathname);
return PTR_ERR(swapfile);
}
ret = write_swapfile_for_memcg(swapfile->f_mapping,
&memcg->swap_dev->type);
filp_close(swapfile, NULL);
putname(pathname);
return ret < 0 ? ret : nbytes;
}
int memcg_get_swap_type(struct page *page)
{
struct mem_cgroup *memcg;
int type;
if (!static_branch_likely(&memcg_swap_qos_key))
return SWAP_TYPE_ALL;
if (!page)
return SWAP_TYPE_ALL;
rcu_read_lock();
memcg = page_memcg(page);
if (!memcg || mem_cgroup_is_root(memcg)) {
rcu_read_unlock();
return SWAP_TYPE_ALL;
}
if (!css_tryget_online(&memcg->css)) {
rcu_read_unlock();
return SWAP_TYPE_ALL;
}
rcu_read_unlock();
type = READ_ONCE(memcg->swap_dev->type);
css_put(&memcg->css);
return type;
}
void memcg_remove_swapfile(int type)
{
struct mem_cgroup *memcg;
if (!static_branch_likely(&memcg_swap_qos_key))
return;
for_each_mem_cgroup(memcg)
if (READ_ONCE(memcg->swap_dev->type) == type)
WRITE_ONCE(memcg->swap_dev->type, SWAP_TYPE_NONE);
}
static long mem_cgroup_get_nr_swap_pages_type(struct mem_cgroup *memcg)
{
int type;
if (!static_branch_likely(&memcg_swap_qos_key))
return mem_cgroup_get_nr_swap_pages(memcg);
type = READ_ONCE(memcg->swap_dev->type);
if (type == SWAP_TYPE_ALL)
return mem_cgroup_get_nr_swap_pages(memcg);
else if (type == SWAP_TYPE_NONE)
return 0;
else
return get_nr_swap_pages_type(type);
}
#else
static int memcg_alloc_swap_device(struct mem_cgroup *memcg)
{
return 0;
}
static void memcg_free_swap_device(struct mem_cgroup *memcg)
{
}
static void memcg_swap_device_init(struct mem_cgroup *memcg,
struct mem_cgroup *parent)
{
}
static int mem_cgroup_check_swap_for_v1(struct page *page, swp_entry_t entry)
{
return 0;
}
int memcg_get_swap_type(struct page *page)
{
return SWAP_TYPE_ALL;
}
void memcg_remove_swapfile(int type)
{
}
static long mem_cgroup_get_nr_swap_pages_type(struct mem_cgroup *memcg)
{
return mem_cgroup_get_nr_swap_pages(memcg);
}
#endif
#ifdef CONFIG_NUMA
#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
......@@ -5230,7 +5569,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
}
reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
GFP_KERNEL, true);
GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
if (!reclaimed && !nr_retries--)
break;
......@@ -5265,16 +5604,47 @@ static int memcg_events_local_show(struct seq_file *m, void *v)
return 0;
}
static int reclaim_param_parse(char *buf, unsigned long *nr_pages,
unsigned int *reclaim_options)
{
char *endp;
u64 bytes;
if (!strcmp(buf, "")) {
*nr_pages = PAGE_COUNTER_MAX;
return 0;
}
bytes = memparse(buf, &endp);
if (*endp == ' ') {
buf = endp + 1;
buf = strim(buf);
if (!strcmp(buf, "type=anon"))
*reclaim_options |= MEMCG_RECLAIM_NOT_FILE;
else if (!strcmp(buf, "type=file"))
*reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
else
return -EINVAL;
} else if (*endp != '\0')
return -EINVAL;
*nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
return 0;
}
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_retries = MAX_RECLAIM_RETRIES;
unsigned long nr_to_reclaim, nr_reclaimed = 0;
unsigned int reclaim_options;
int err;
reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
buf = strstrip(buf);
err = page_counter_memparse(buf, "", &nr_to_reclaim);
err = reclaim_param_parse(buf, &nr_to_reclaim, &reclaim_options);
if (err)
return err;
......@@ -5288,6 +5658,11 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
if (signal_pending(current))
return -EINTR;
/* If only reclaim swap pages, check swap space at first. */
if ((reclaim_options & MEMCG_RECLAIM_NOT_FILE) &&
(mem_cgroup_get_nr_swap_pages_type(memcg) <= 0))
return -EAGAIN;
/* This is the final attempt, drain percpu lru caches in the
* hope of introducing more evictable pages for
* try_to_free_mem_cgroup_pages().
......@@ -5297,7 +5672,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
reclaimed = try_to_free_mem_cgroup_pages(memcg,
nr_to_reclaim - nr_reclaimed,
GFP_KERNEL, true);
GFP_KERNEL, reclaim_options);
if (!reclaimed && !nr_retries--)
return -EAGAIN;
......@@ -5710,6 +6085,25 @@ static struct cftype mem_cgroup_legacy_files[] = {
.name = "reclaim",
.write = memory_reclaim,
},
#ifdef CONFIG_MEMCG_SWAP_QOS
{
.name = "force_swapin",
.flags = CFTYPE_NOT_ON_ROOT,
.write = memory_swapin,
},
{
.name = "swap.max",
.flags = CFTYPE_NOT_ON_ROOT,
.write = memcg_swapmax_write,
.read_u64 = memcg_swapmax_read,
},
{
.name = "swapfile",
.flags = CFTYPE_NOT_ON_ROOT,
.write = memcg_swapfile_write,
.seq_show = memcg_swapfile_read,
},
#endif
{
.name = "high_async_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
......@@ -5854,6 +6248,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
memcg_free_swap_device(memcg);
kfree(memcg);
}
......@@ -5878,6 +6273,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
if (!memcg)
return ERR_PTR(error);
if (memcg_alloc_swap_device(memcg))
goto fail;
memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
1, MEM_CGROUP_ID_MAX,
GFP_KERNEL);
......@@ -5955,17 +6353,20 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_init(&memcg->swap, NULL);
page_counter_init(&memcg->kmem, NULL);
page_counter_init(&memcg->tcpmem, NULL);
memcg_swap_device_init(memcg, NULL);
} else if (parent->use_hierarchy) {
memcg->use_hierarchy = true;
page_counter_init(&memcg->memory, &parent->memory);
page_counter_init(&memcg->swap, &parent->swap);
page_counter_init(&memcg->kmem, &parent->kmem);
page_counter_init(&memcg->tcpmem, &parent->tcpmem);
memcg_swap_device_init(memcg, parent);
} else {
page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
memcg_swap_device_init(memcg, root_mem_cgroup);
/*
* Deeper hierachy with use_hierarchy == false doesn't make
* much sense so let cgroup subsystem know about this
......@@ -6984,7 +7385,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
GFP_KERNEL, true))
GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
nr_reclaims--;
continue;
}
......@@ -7899,7 +8300,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
unsigned short oldid;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
return mem_cgroup_check_swap_for_v1(page, entry);
memcg = page_memcg(page);
......
......@@ -35,6 +35,11 @@
#include <linux/mm.h>
static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
#ifdef CONFIG_MEMCG_SWAP_QOS
static unsigned int nr_swap_slots;
static unsigned int max_swap_slots;
static DEFINE_PER_CPU(struct swap_slots_cache [MAX_SWAPFILES], swp_type_slots);
#endif
static bool swap_slot_cache_active;
bool swap_slot_cache_enabled;
static bool swap_slot_cache_initialized;
......@@ -111,7 +116,37 @@ static bool check_cache_active(void)
return swap_slot_cache_active;
}
static int alloc_swap_slot_cache(unsigned int cpu)
#ifdef CONFIG_MEMCG_SWAP_QOS
static inline struct swap_slots_cache *get_slots_cache(int swap_type)
{
if (swap_type == SWAP_TYPE_ALL)
return raw_cpu_ptr(&swp_slots);
else
return raw_cpu_ptr(&swp_type_slots)[swap_type];
}
static inline struct swap_slots_cache *get_slots_cache_cpu(unsigned int cpu,
int swap_type)
{
if (swap_type == SWAP_TYPE_ALL)
return &per_cpu(swp_slots, cpu);
else
return &per_cpu(swp_type_slots, cpu)[swap_type];
}
#else
static inline struct swap_slots_cache *get_slots_cache(int swap_type)
{
return raw_cpu_ptr(&swp_slots);
}
static inline struct swap_slots_cache *get_slots_cache_cpu(unsigned int cpu,
int swap_type)
{
return &per_cpu(swp_slots, cpu);
}
#endif
static int alloc_swap_slot_cache_cpu_type(unsigned int cpu, int swap_type)
{
struct swap_slots_cache *cache;
swp_entry_t *slots, *slots_ret;
......@@ -134,7 +169,7 @@ static int alloc_swap_slot_cache(unsigned int cpu)
}
mutex_lock(&swap_slots_cache_mutex);
cache = &per_cpu(swp_slots, cpu);
cache = get_slots_cache_cpu(cpu, swap_type);
if (cache->slots || cache->slots_ret) {
/* cache already allocated */
mutex_unlock(&swap_slots_cache_mutex);
......@@ -166,13 +201,74 @@ static int alloc_swap_slot_cache(unsigned int cpu)
return 0;
}
static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
bool free_slots)
#ifdef CONFIG_MEMCG_SWAP_QOS
static int __alloc_swap_slot_cache_cpu(unsigned int cpu)
{
int i, ret;
ret = alloc_swap_slot_cache_cpu_type(cpu, SWAP_TYPE_ALL);
if (ret)
return ret;
for (i = 0; i < nr_swap_slots; i++) {
ret = alloc_swap_slot_cache_cpu_type(cpu, i);
if (ret)
return ret;
}
return ret;
}
static void alloc_swap_slot_cache_type(int type)
{
unsigned int cpu;
if (type >= max_swap_slots)
max_swap_slots = type + 1;
if (!static_branch_likely(&memcg_swap_qos_key))
return;
/* serialize with cpu hotplug operations */
get_online_cpus();
while (type >= nr_swap_slots) {
for_each_online_cpu(cpu)
alloc_swap_slot_cache_cpu_type(cpu, nr_swap_slots);
nr_swap_slots++;
}
put_online_cpus();
}
void enable_swap_slots_cache_max(void)
{
mutex_lock(&swap_slots_cache_enable_mutex);
if (max_swap_slots)
alloc_swap_slot_cache_type(max_swap_slots - 1);
mutex_unlock(&swap_slots_cache_enable_mutex);
}
#else
static inline int __alloc_swap_slot_cache_cpu(unsigned int cpu)
{
return alloc_swap_slot_cache_cpu_type(cpu, SWAP_TYPE_ALL);
}
static void alloc_swap_slot_cache_type(int type)
{
}
#endif
static int alloc_swap_slot_cache(unsigned int cpu)
{
return __alloc_swap_slot_cache_cpu(cpu);
}
static void drain_slots_cache_cpu_type(unsigned int cpu, unsigned int type,
bool free_slots, int swap_type)
{
struct swap_slots_cache *cache;
swp_entry_t *slots = NULL;
cache = &per_cpu(swp_slots, cpu);
cache = get_slots_cache_cpu(cpu, swap_type);
if ((type & SLOTS_CACHE) && cache->slots) {
mutex_lock(&cache->alloc_lock);
swapcache_free_entries(cache->slots + cache->cur, cache->nr);
......@@ -198,6 +294,30 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
}
}
#ifdef CONFIG_MEMCG_SWAP_QOS
static void __drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
bool free_slots)
{
int i;
drain_slots_cache_cpu_type(cpu, type, free_slots, SWAP_TYPE_ALL);
for (i = 0; i < nr_swap_slots; i++)
drain_slots_cache_cpu_type(cpu, type, free_slots, i);
}
#else
static inline void __drain_slots_cache_cpu(unsigned int cpu,
unsigned int type, bool free_slots)
{
drain_slots_cache_cpu_type(cpu, type, free_slots, SWAP_TYPE_ALL);
}
#endif
static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
bool free_slots)
{
__drain_slots_cache_cpu(cpu, type, free_slots);
}
static void __drain_swap_slots_cache(unsigned int type)
{
unsigned int cpu;
......@@ -237,7 +357,7 @@ static int free_slot_cache(unsigned int cpu)
return 0;
}
void enable_swap_slots_cache(void)
void enable_swap_slots_cache(int type)
{
mutex_lock(&swap_slots_cache_enable_mutex);
if (!swap_slot_cache_initialized) {
......@@ -251,14 +371,14 @@ void enable_swap_slots_cache(void)
swap_slot_cache_initialized = true;
}
alloc_swap_slot_cache_type(type);
__reenable_swap_slots_cache();
out_unlock:
mutex_unlock(&swap_slots_cache_enable_mutex);
}
/* called with swap slot cache's alloc lock held */
static int refill_swap_slots_cache(struct swap_slots_cache *cache)
static int refill_swap_slots_cache(struct swap_slots_cache *cache, int type)
{
if (!use_swap_slot_cache || cache->nr)
return 0;
......@@ -266,7 +386,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
cache->cur = 0;
if (swap_slot_cache_active)
cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
cache->slots, 1);
cache->slots, 1, type);
return cache->nr;
}
......@@ -307,12 +427,17 @@ swp_entry_t get_swap_page(struct page *page)
{
swp_entry_t entry;
struct swap_slots_cache *cache;
int type;
entry.val = 0;
type = memcg_get_swap_type(page);
if (type == SWAP_TYPE_NONE)
goto out;
if (PageTransHuge(page)) {
if (IS_ENABLED(CONFIG_THP_SWAP))
get_swap_pages(1, &entry, HPAGE_PMD_NR);
get_swap_pages(1, &entry, HPAGE_PMD_NR, type);
goto out;
}
......@@ -325,7 +450,7 @@ swp_entry_t get_swap_page(struct page *page)
* The alloc path here does not touch cache->slots_ret
* so cache->free_lock is not taken.
*/
cache = raw_cpu_ptr(&swp_slots);
cache = get_slots_cache(type);
if (likely(check_cache_active() && cache->slots)) {
mutex_lock(&cache->alloc_lock);
......@@ -335,7 +460,7 @@ swp_entry_t get_swap_page(struct page *page)
entry = cache->slots[cache->cur];
cache->slots[cache->cur++].val = 0;
cache->nr--;
} else if (refill_swap_slots_cache(cache)) {
} else if (refill_swap_slots_cache(cache, type)) {
goto repeat;
}
}
......@@ -344,7 +469,7 @@ swp_entry_t get_swap_page(struct page *page)
goto out;
}
get_swap_pages(1, &entry, 1);
get_swap_pages(1, &entry, 1, type);
out:
if (mem_cgroup_try_charge_swap(page, entry)) {
put_swap_page(page, entry);
......
......@@ -1056,7 +1056,97 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
}
int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
#ifdef CONFIG_MEMCG_SWAP_QOS
int write_swapfile_for_memcg(struct address_space *mapping, int *swap_type)
{
struct swap_info_struct *si;
unsigned int type;
int ret = -EINVAL;
spin_lock(&swap_lock);
for (type = 0; type < nr_swapfiles; type++) {
si = swap_info[type];
if ((si->flags & SWP_WRITEOK) &&
(si->swap_file->f_mapping == mapping)) {
WRITE_ONCE(*swap_type, type);
ret = 0;
break;
}
}
spin_unlock(&swap_lock);
return ret;
}
void read_swapfile_for_memcg(struct seq_file *m, int type)
{
struct swap_info_struct *si;
spin_lock(&swap_lock);
if (type < nr_swapfiles) {
si = swap_info[type];
if (si->flags & SWP_WRITEOK) {
seq_file_path(m, si->swap_file, "\t\n\\");
seq_printf(m, "\n");
}
}
spin_unlock(&swap_lock);
}
long get_nr_swap_pages_type(int type)
{
struct swap_info_struct *si;
long nr_swap_pages = 0;
spin_lock(&swap_lock);
if (type < nr_swapfiles) {
si = swap_info[type];
if (si->flags & SWP_WRITEOK)
nr_swap_pages = si->pages - si->inuse_pages;
}
spin_unlock(&swap_lock);
return nr_swap_pages;
}
static long get_avail_pages(unsigned long size, int type)
{
long avail_pgs = 0;
if (type == SWAP_TYPE_ALL)
return atomic_long_read(&nr_swap_pages) / size;
spin_unlock(&swap_avail_lock);
avail_pgs = get_nr_swap_pages_type(type) / size;
spin_lock(&swap_avail_lock);
return avail_pgs;
}
static inline bool should_skip_swap_type(int swap_type, int type)
{
if (type == SWAP_TYPE_ALL)
return false;
return (type != swap_type);
}
#else
long get_nr_swap_pages_type(int type)
{
return 0;
}
static inline long get_avail_pages(unsigned long size, int type)
{
return atomic_long_read(&nr_swap_pages) / size;
}
static inline bool should_skip_swap_type(int swap_type, int type)
{
return false;
}
#endif
int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size,
int type)
{
unsigned long size = swap_entry_size(entry_size);
struct swap_info_struct *si, *next;
......@@ -1069,7 +1159,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
spin_lock(&swap_avail_lock);
avail_pgs = atomic_long_read(&nr_swap_pages) / size;
avail_pgs = get_avail_pages(size, type);
if (avail_pgs <= 0) {
spin_unlock(&swap_avail_lock);
goto noswap;
......@@ -1086,6 +1176,11 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock);
spin_lock(&si->lock);
if (should_skip_swap_type(si->type, type)) {
spin_unlock(&si->lock);
spin_lock(&swap_avail_lock);
goto nextsi;
}
if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
spin_lock(&swap_avail_lock);
if (plist_node_empty(&si->avail_lists[node])) {
......@@ -2703,6 +2798,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
cluster_info = p->cluster_info;
p->cluster_info = NULL;
frontswap_map = frontswap_map_get(p);
memcg_remove_swapfile(p->type);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
arch_swap_invalidate_area(p->type);
......@@ -3457,7 +3553,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (inode)
inode_unlock(inode);
if (!error)
enable_swap_slots_cache();
enable_swap_slots_cache(p->type);
return error;
}
......
......@@ -103,6 +103,12 @@ struct scan_control {
/* Can pages be swapped as part of reclaim? */
unsigned int may_swap:1;
/* Should skip file pages? */
unsigned int not_file:1;
/* Proactive reclaim invoked by userspace through memory.reclaim */
unsigned int proactive:1;
/*
* Cgroup memory below memory.low is protected as long as we
* don't threaten to OOM. If any cgroup is reclaimed at
......@@ -2461,6 +2467,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long ap, fp;
enum lru_list lru;
if (sc->not_file) {
scan_balance = SCAN_ANON;
goto out;
}
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
scan_balance = SCAN_FILE;
......@@ -2880,9 +2891,10 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
sc->priority);
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
if (!sc->proactive)
vmpressure(sc->gfp_mask, memcg, false,
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}
......@@ -3005,9 +3017,10 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
}
/* Record the subtree's reclaim efficiency */
vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
if (!sc->proactive)
vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
if (sc->nr_reclaimed - nr_reclaimed)
reclaimable = true;
......@@ -3252,8 +3265,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
do {
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);
if (!sc->proactive)
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);
sc->nr_scanned = 0;
shrink_zones(zonelist, sc);
......@@ -3562,7 +3576,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
bool may_swap)
unsigned int reclaim_options)
{
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
......@@ -3575,7 +3589,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.priority = DEF_PRIORITY,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = may_swap,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
.not_file = !!(reclaim_options & MEMCG_RECLAIM_NOT_FILE),
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
......