Commit 62015863 authored by Yu Liao, committed by Zheng Zengkai

mm: Add sysctl to clear free list pages

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S
CVE: NA

--------------------------------

This patch adds a sysctl to clear the pages on the free lists of each NUMA
node. For each NUMA node, every page on its free lists is cleared; the work
is scheduled on a random CPU of that NUMA node.

When KASAN is enabled, the shadow memory of free pages is filled with 0xFF,
so writing to these free pages triggers a use-after-free report. Therefore
KASAN is disabled for the clear freelist code.

On systems with large memory, clearing the free lists holds the zone lock
for a long time. As a result, other processes may be blocked until the clear
freelist work exits, which can cause the system to be reset by the watchdog.

Provide a mechanism to stop the clear freelist workers when the elapsed time
exceeds cfp_timeout_ms, which can be set via module_param().
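
For illustration, a minimal userspace sketch that triggers a clearing pass
through the new sysctl (run as root; the knob accepts only the value 1 and is
present only when CONFIG_CLEAR_FREELIST_PAGE is set and "clear_freelist" is
passed on the kernel command line):

  /* Minimal sketch: trigger one clear_freelist_pages pass. */
  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          /* mode 0200, root only */
          int fd = open("/proc/sys/vm/clear_freelist_pages", O_WRONLY);

          if (fd < 0) {
                  perror("open"); /* feature not built in, not enabled, or no permission */
                  return 1;
          }
          if (write(fd, "1", 1) != 1)
                  perror("write");
          close(fd);
          return 0;
  }

The per-pass time budget can be tuned through the cfp_timeout_ms module
parameter (expected under /sys/module/clear_freelist_page/parameters/ when
the feature is built in).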
Signed-off-by: Yu Liao <liaoyu15@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Parent 26bffd68
@@ -539,6 +539,10 @@
        cio_ignore=     [S390]
                        See Documentation/s390/common_io.rst for details.

        clear_freelist
                        Enable clear_freelist feature.

        clk_ignore_unused
                        [CLK]
                        Prevents the clock framework from automatically gating
......
@@ -25,6 +25,7 @@ files can be found in mm/swap.c.
Currently, these files are in /proc/sys/vm:

- admin_reserve_kbytes
- clear_freelist_pages
- compact_memory
- compaction_proactiveness
- compact_unevictable_allowed

@@ -109,6 +110,18 @@ On x86_64 this is about 128MB.
Changing this takes effect whenever an application requests memory.


clear_freelist_pages
====================

Available only when CONFIG_CLEAR_FREELIST_PAGE is set. When 1 is written to
the file, all pages in the free lists are written with 0.

The zone lock is held during clear_freelist_pages, so if the execution time
is too long, RCU CPU stall warnings will be printed. For each NUMA node,
clear_freelist_pages is performed on a "random" CPU of that node. The time
consumed depends on the hardware.


compact_memory
==============
......
@@ -970,6 +970,19 @@ config MEMORY_RELIABLE
          To enable this function, mirrored memory is needed and
          "kernelcore=reliable" need to be added in kernel parameters.

config CLEAR_FREELIST_PAGE
        bool "Support for clear free list pages"
        depends on MMU && SYSCTL
        default n
        help
          Say y here to enable the clear free list pages feature. Writing
          to the clear_freelist_pages sysctl triggers clearing of the free
          memory of the buddy system.

          To enable this feature, the kernel parameter "clear_freelist"
          also needs to be added.

source "mm/damon/Kconfig"

endmenu
@@ -7,6 +7,7 @@ KASAN_SANITIZE_slab_common.o := n
KASAN_SANITIZE_slab.o := n
KASAN_SANITIZE_slub.o := n
KCSAN_SANITIZE_kmemleak.o := n
KASAN_SANITIZE_clear_freelist_page.o := n

# These produce frequent data race reports: most of them are due to races on
# the same word but accesses to different bits of that word. Re-enable KCSAN
@@ -129,4 +130,5 @@ obj-$(CONFIG_PIN_MEMORY) += pin_mem.o
obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o
obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o
obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o
obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o
obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o
// SPDX-License-Identifier: GPL-2.0
/*
* Support for clear free list pages.
*/
#include <linux/mmzone.h>
#include <linux/mm_types.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
#include <linux/atomic.h>
#include <linux/nmi.h>
#include <linux/sched/clock.h>
#include <linux/module.h>

#define CFP_DEFAULT_TIMEOUT 2000
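
/*
 * Like for_each_populated_zone(), but restricted to the zones of a
 * single NUMA node (pgdat).
 */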
#define for_each_populated_zone_pgdat(pgdat, zone)             \
        for (zone = pgdat->node_zones;                          \
             zone;                                              \
             zone = next_pgdat_zone(zone))                      \
                if (!populated_zone(zone))                      \
                        ; /* do nothing */                      \
                else

struct pgdat_entry {
        struct pglist_data *pgdat;
        struct work_struct work;
};

static DECLARE_WAIT_QUEUE_HEAD(clear_freelist_wait);
static DEFINE_MUTEX(clear_freelist_lock);
static atomic_t clear_freelist_workers;
static atomic_t clear_pages_num;
static ulong cfp_timeout_ms = CFP_DEFAULT_TIMEOUT;

/*
 * next_pgdat_zone - helper magic for for_each_populated_zone_pgdat()
 */
static struct zone *next_pgdat_zone(struct zone *zone)
{
        pg_data_t *pgdat = zone->zone_pgdat;

        if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
                zone++;
        else
                zone = NULL;

        return zone;
}
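
/*
 * Per-node worker: walk every populated zone of the node and zero each
 * page sitting on the buddy free lists, with zone->lock held. Bail out
 * once cfp_timeout_ms has elapsed so the zone lock is not held forever.
 */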
static void clear_pgdat_freelist_pages(struct work_struct *work)
{
        struct pgdat_entry *entry = container_of(work, struct pgdat_entry, work);
        u64 cfp_timeout_ns = cfp_timeout_ms * NSEC_PER_MSEC;
        struct pglist_data *pgdat = entry->pgdat;
        unsigned long flags, order, t;
        struct page *page;
        struct zone *zone;
        u64 start, now;

        start = sched_clock();
        for_each_populated_zone_pgdat(pgdat, zone) {
                spin_lock_irqsave(&zone->lock, flags);
                for_each_migratetype_order(order, t) {
                        list_for_each_entry(page, &zone->free_area[order].free_list[t], lru) {
                                now = sched_clock();
                                if (unlikely(now - start > cfp_timeout_ns)) {
                                        spin_unlock_irqrestore(&zone->lock, flags);
                                        goto out;
                                }

#ifdef CONFIG_KMAP_LOCAL
                                int i;

                                /* Clear highmem by clear_highpage() */
                                for (i = 0; i < (1 << order); i++)
                                        clear_highpage(page + i);
#else
                                memset(page_address(page), 0, (1 << order) * PAGE_SIZE);
#endif
                                touch_nmi_watchdog();
                                atomic_add(1 << order, &clear_pages_num);
                        }
                }
                spin_unlock_irqrestore(&zone->lock, flags);
                cond_resched();
        }

out:
        kfree(entry);

        if (atomic_dec_and_test(&clear_freelist_workers))
                wake_up(&clear_freelist_wait);
}
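
/*
 * Queue one clearing work item for @pgdat on the unbound workqueue,
 * preferring a CPU of that node via queue_work_node().
 */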
static void init_clear_freelist_work(struct pglist_data *pgdat)
{
        struct pgdat_entry *entry;

        entry = kzalloc(sizeof(struct pgdat_entry), GFP_KERNEL);
        if (!entry)
                return;

        entry->pgdat = pgdat;
        INIT_WORK(&entry->work, clear_pgdat_freelist_pages);
        queue_work_node(pgdat->node_id, system_unbound_wq, &entry->work);
}
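
/*
 * Drain the per-CPU page lists, kick one worker per online node and wait
 * until all of them have finished. Serialized by clear_freelist_lock so
 * only one clearing pass runs at a time.
 */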
static void clear_freelist_pages(void)
{
        struct pglist_data *pgdat;

        mutex_lock(&clear_freelist_lock);
        drain_all_pages(NULL);

        for_each_online_pgdat(pgdat) {
                atomic_inc(&clear_freelist_workers);
                init_clear_freelist_work(pgdat);
        }

        wait_event(clear_freelist_wait, atomic_read(&clear_freelist_workers) == 0);

        pr_debug("Cleared pages %d\nFree pages %lu\n", atomic_read(&clear_pages_num),
                 global_zone_page_state(NR_FREE_PAGES));
        atomic_set(&clear_pages_num, 0);

        mutex_unlock(&clear_freelist_lock);
}
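
/*
 * Sysctl handler: extra1 == extra2 == SYSCTL_ONE restricts the accepted
 * value to 1; any successful write starts a clearing pass.
 */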
static int sysctl_clear_freelist_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp, loff_t *ppos)
{
        int ret;
        int val;

        table->data = &val;
        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        if (!ret && write)
                clear_freelist_pages();

        return ret;
}

static struct ctl_table clear_freelist_table[] = {
        {
                .procname     = "clear_freelist_pages",
                .data         = NULL,
                .maxlen       = sizeof(int),
                .mode         = 0200,
                .proc_handler = &sysctl_clear_freelist_handler,
                .extra1       = SYSCTL_ONE,
                .extra2       = SYSCTL_ONE,
        },
        { }
};

static struct ctl_table sys_ctl_table[] = {
        {
                .procname     = "vm",
                .mode         = 0555,
                .child        = clear_freelist_table,
        },
        { }
};
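
/*
 * The sysctl is registered only when "clear_freelist" is passed on the
 * kernel command line; without it the feature stays dormant.
 */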
static bool clear_freelist_enabled;

static int __init setup_clear_freelist(char *str)
{
        clear_freelist_enabled = true;
        return 1;
}
__setup("clear_freelist", setup_clear_freelist);

static int __init clear_freelist_init(void)
{
        if (clear_freelist_enabled)
                register_sysctl_table(sys_ctl_table);

        return 0;
}
module_init(clear_freelist_init);
module_param(cfp_timeout_ms, ulong, 0644);