From 6174ecb523613c8ed8dcdc889d46f4c02f65b9e4 Mon Sep 17 00:00:00 2001
From: zhong jiang
Date: Fri, 15 Feb 2019 10:32:33 +0800
Subject: [PATCH] pagecache: add sysctl interface to limit pagecache

euleros inclusion
category: feature
feature: pagecache limit

Add proc sysctl interfaces to limit the size of the page cache and to
reclaim page cache memory, both when the configured limit is exceeded
and periodically from a per-CPU deferrable work item.

Signed-off-by: zhong jiang
Reviewed-by: Jing xiangfeng
Signed-off-by: zhong jiang
Signed-off-by: Yang Yingliang
---
 include/linux/pagemap.h | 18 +--------
 include/linux/swap.h    | 19 +++++++++
 kernel/sysctl.c         | 18 +++++++++
 mm/filemap.c            | 20 ++++++++++
 mm/page_alloc.c         | 68 ++++++++++++++++++++++++++++++++
 mm/vmscan.c             | 86 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 213 insertions(+), 16 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index b1bd2186e6d2..65245ce3557f 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -613,6 +613,8 @@ static inline int fault_in_pages_readable(const char __user *uaddr, int size)
 	return 0;
 }
 
+int add_to_page_cache(struct page *page, struct address_space *mapping,
+				pgoff_t index, gfp_t gfp_mask);
 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
@@ -623,22 +625,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
 void delete_from_page_cache_batch(struct address_space *mapping,
 				  struct pagevec *pvec);
 
-/*
- * Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run __SetPageLocked() against it.
- */
-static inline int add_to_page_cache(struct page *page,
-		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
-{
-	int error;
-
-	__SetPageLocked(page);
-	error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
-	if (unlikely(error))
-		__ClearPageLocked(page);
-	return error;
-}
-
 static inline unsigned long dir_pages(struct inode *inode)
 {
 	return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >>
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 77221c16733a..d7046787c40d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -367,6 +367,25 @@ extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
 
+extern unsigned long vm_cache_limit_ratio;
+extern unsigned long vm_cache_limit_ratio_min;
+extern unsigned long vm_cache_limit_ratio_max;
+extern unsigned long vm_cache_limit_mbytes;
+extern unsigned long vm_cache_limit_mbytes_min;
+extern unsigned long vm_cache_limit_mbytes_max;
+extern int vm_cache_reclaim_s;
+extern int vm_cache_reclaim_s_min;
+extern int vm_cache_reclaim_s_max;
+extern int vm_cache_reclaim_weight;
+extern int vm_cache_reclaim_weight_min;
+extern int vm_cache_reclaim_weight_max;
+extern unsigned long page_cache_over_limit(void);
+extern void shrink_page_cache(gfp_t mask);
+extern int cache_limit_ratio_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos);
+extern int cache_limit_mbytes_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos);
+
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 292e19af18d7..739da03342d2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1364,6 +1364,24 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.procname	= "cache_reclaim_s",
+		.data		= &vm_cache_reclaim_s,
+		.maxlen		= sizeof(vm_cache_reclaim_s),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &vm_cache_reclaim_s_min,
+		.extra2		= &vm_cache_reclaim_s_max,
+	},
+	{
+		.procname	= "cache_reclaim_weight",
+		.data		= &vm_cache_reclaim_weight,
+		.maxlen		= sizeof(vm_cache_reclaim_weight),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &vm_cache_reclaim_weight_min,
+		.extra2		= &vm_cache_reclaim_weight_max,
+	},
 #ifdef CONFIG_HUGETLB_PAGE
 	{
 		.procname	= "nr_hugepages",
diff --git a/mm/filemap.c b/mm/filemap.c
index 52517f28e6f4..ac4b66869cca 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -895,6 +895,26 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 }
 EXPORT_SYMBOL(add_to_page_cache_locked);
 
+/*
+ * Like add_to_page_cache_locked, but used to add newly allocated pages:
+ * the page is new, so we can just run __SetPageLocked() against it.
+ */
+int add_to_page_cache(struct page *page,
+		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
+{
+	int error;
+
+	if (vm_cache_limit_mbytes && page_cache_over_limit())
+		shrink_page_cache(gfp_mask);
+	__SetPageLocked(page);
+	error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
+	if (unlikely(error))
+		__ClearPageLocked(page);
+
+	return error;
+}
+EXPORT_SYMBOL(add_to_page_cache);
+
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t offset, gfp_t gfp_mask)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a9de1dbb9a6c..19bf37971989 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8149,3 +8149,71 @@ bool set_hwpoison_free_buddy_page(struct page *page)
 	return hwpoisoned;
 }
 #endif
+
+unsigned long page_cache_over_limit(void)
+{
+	unsigned long lru_file, limit;
+
+	limit = vm_cache_limit_mbytes * ((1024 * 1024UL) / PAGE_SIZE);
+	lru_file = global_node_page_state(NR_ACTIVE_FILE) +
+		   global_node_page_state(NR_INACTIVE_FILE);
+	if (lru_file > limit)
+		return lru_file - limit;
+
+	return 0;
+}
+
+int cache_limit_ratio_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int ret;
+
+	/* totalram_pages may change after early boot */
+	vm_cache_limit_mbytes_max = totalram_pages >> (20 - PAGE_SHIFT);
+
+	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+	if (ret)
+		return ret;
+	if (write) {
+		vm_cache_limit_mbytes = totalram_pages
+			* vm_cache_limit_ratio / 100
+			* PAGE_SIZE / (1024 * 1024UL);
+		if (vm_cache_limit_ratio)
+			pr_warn("page cache limit set to %lu%%\n",
+				vm_cache_limit_ratio);
+		else
+			pr_warn("page cache limit off\n");
+		while (vm_cache_limit_mbytes && page_cache_over_limit())
+			shrink_page_cache(GFP_KERNEL);
+	}
+
+	return 0;
+}
+
+int cache_limit_mbytes_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int ret;
+
+	vm_cache_limit_mbytes_max = totalram_pages >> (20 - PAGE_SHIFT);
+
+	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+	if (ret)
+		return ret;
+	if (write) {
+		vm_cache_limit_ratio = (vm_cache_limit_mbytes
+			* ((1024 * 1024UL) / PAGE_SIZE)
+			+ totalram_pages / 200)
+			* 100 / totalram_pages;
+		if (vm_cache_limit_mbytes)
+			pr_warn("page cache limit set to %luMB\n",
+				vm_cache_limit_mbytes);
+		else
+			pr_warn("page cache limit off\n");
+
+		while (vm_cache_limit_mbytes && page_cache_over_limit())
+			shrink_page_cache(GFP_KERNEL);
+	}
+
+	return 0;
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 961401c46334..993bcb02709a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -166,6 +166,20 @@ int vm_swappiness = 60;
  */
 unsigned long vm_total_pages;
 
+unsigned long vm_cache_limit_ratio;
+unsigned long vm_cache_limit_ratio_min;
+unsigned long vm_cache_limit_ratio_max;
+unsigned long vm_cache_limit_mbytes __read_mostly;
+unsigned long vm_cache_limit_mbytes_min;
+unsigned long vm_cache_limit_mbytes_max;
+int vm_cache_reclaim_s __read_mostly;
+int vm_cache_reclaim_s_min;
+int vm_cache_reclaim_s_max;
+int vm_cache_reclaim_weight __read_mostly;
+int vm_cache_reclaim_weight_min;
+int vm_cache_reclaim_weight_max;
+static DEFINE_PER_CPU(struct delayed_work, vmscan_work);
+
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
@@ -3513,6 +3527,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 
 	count_vm_event(PAGEOUTRUN);
 
+	if (vm_cache_limit_mbytes && page_cache_over_limit())
+		shrink_page_cache(GFP_KERNEL);
+
 	do {
 		unsigned long nr_reclaimed = sc.nr_reclaimed;
 		bool raise_priority = true;
@@ -3895,6 +3912,74 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 }
 #endif /* CONFIG_HIBERNATION */
 
+static unsigned long __shrink_page_cache(gfp_t mask)
+{
+	struct scan_control sc = {
+		.gfp_mask = current_gfp_context(mask),
+		.reclaim_idx = gfp_zone(mask),
+		.may_writepage = !laptop_mode,
+		.nr_to_reclaim = SWAP_CLUSTER_MAX *
+			(unsigned long)vm_cache_reclaim_weight,
+		.may_unmap = 1,
+		.may_swap = 1,
+		.order = 0,
+		.priority = DEF_PRIORITY,
+		.target_mem_cgroup = NULL,
+		.nodemask = NULL,
+	};
+
+	struct zonelist *zonelist = node_zonelist(numa_node_id(), mask);
+
+	return do_try_to_free_pages(zonelist, &sc);
+}
+
+void shrink_page_cache(gfp_t mask)
+{
+	/* We reclaim the highmem zone too, it is useful for 32bit arch */
+	__shrink_page_cache(mask | __GFP_HIGHMEM);
+}
+
+static void shrink_page_cache_work(struct work_struct *w)
+{
+	struct delayed_work *work = to_delayed_work(w);
+
+	if (vm_cache_reclaim_s == 0) {
+		schedule_delayed_work(work, round_jiffies_relative(120 * HZ));
+		return;
+	}
+
+	shrink_page_cache(GFP_KERNEL);
+	schedule_delayed_work(work,
+		round_jiffies_relative((unsigned long)vm_cache_reclaim_s * HZ));
+}
+
+static void shrink_page_cache_init(void)
+{
+	int cpu;
+
+	vm_cache_limit_ratio = 0;
+	vm_cache_limit_ratio_min = 0;
+	vm_cache_limit_ratio_max = 100;
+	vm_cache_limit_mbytes = 0;
+	vm_cache_limit_mbytes_min = 0;
+	vm_cache_limit_mbytes_max = totalram_pages >> (20 - PAGE_SHIFT);
+	vm_cache_reclaim_s = 0;
+	vm_cache_reclaim_s_min = 0;
+	vm_cache_reclaim_s_max = 43200;
+	vm_cache_reclaim_weight = 1;
+	vm_cache_reclaim_weight_min = 1;
+	vm_cache_reclaim_weight_max = 100;
+
+	for_each_online_cpu(cpu) {
+		struct delayed_work *work = &per_cpu(vmscan_work, cpu);
+
+		INIT_DEFERRABLE_WORK(work, shrink_page_cache_work);
+		schedule_delayed_work_on(cpu, work,
+			__round_jiffies_relative(
+				(unsigned long)vm_cache_reclaim_s * HZ, cpu));
+	}
+}
+
 /* It's optimal to keep kswapds on the same CPUs as their memory, but
    not required for correctness.  So if the last cpu in a node goes
    away, we get changed to run anywhere: as the first one comes back,
@@ -3964,6 +4049,7 @@ static int __init kswapd_init(void)
 					"mm/vmscan:online", kswapd_cpu_online,
 					NULL);
 	WARN_ON(ret < 0);
+	shrink_page_cache_init();
 	return 0;
 }
-- 
GitLab
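
As a usage illustration (not part of the patch itself), the sketch below drives
the two knobs that this patch registers in vm_table, /proc/sys/vm/cache_reclaim_s
and /proc/sys/vm/cache_reclaim_weight, from a userspace C program. The file
paths follow the .procname entries above; the values written are arbitrary
examples, not recommendations.

/*
 * Illustrative userspace sketch: enable the periodic page cache reclaim
 * added by the patch.  Assumes a kernel with the patch applied, so the
 * two files below exist under /proc/sys/vm.
 */
#include <stdio.h>
#include <stdlib.h>

static void write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(EXIT_FAILURE);
	}
	fputs(val, f);
	fclose(f);
}

int main(void)
{
	/* run one background reclaim pass every 300 seconds */
	write_sysctl("/proc/sys/vm/cache_reclaim_s", "300");
	/* each pass targets 4 * SWAP_CLUSTER_MAX pages */
	write_sysctl("/proc/sys/vm/cache_reclaim_weight", "4");
	return 0;
}

Note that a per-CPU worker that was idling because cache_reclaim_s was 0 only
re-reads the interval on its next 120-second wakeup (see
shrink_page_cache_work()), so a newly written period takes effect within
roughly two minutes.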