From 8b80549823a3a1466f6fa8b95a342e7f9ec86167 Mon Sep 17 00:00:00 2001 From: Yu Liao Date: Wed, 9 Feb 2022 15:36:41 +0800 Subject: [PATCH] mm: add sysctl to clear free list pages hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4SK3S CVE: NA -------------------------------- This patch add sysctl to clear pages in free lists of each NUMA node. For each NUMA node, clear each page in the free list, these work is scheduled on a random CPU of the NUMA node. When kasan is enabled and the pages are free, the shadow memory will be filled with 0xFF, writing these free pages will cause UAF, so just disable KASAN for clear freelist. Signed-off-by: Yu Liao Reviewed-by: Kefeng Wang Signed-off-by: Yang Yingliang --- Documentation/sysctl/vm.txt | 13 +++ mm/Kconfig | 7 ++ mm/Makefile | 2 + mm/clear_freelist_page.c | 163 ++++++++++++++++++++++++++++++++++++ 4 files changed, 185 insertions(+) create mode 100644 mm/clear_freelist_page.c diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 7d73882e2c27..8d824892d00d 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -20,6 +20,7 @@ Currently, these files are in /proc/sys/vm: - admin_reserve_kbytes - block_dump +- clear_freelist_pages - compact_memory - compact_unevictable_allowed - dirty_background_bytes @@ -104,6 +105,18 @@ information on block I/O debugging is in Documentation/laptops/laptop-mode.txt. ============================================================== +clear_freelist_pages + +Available only when CONFIG_CLEAR_FREELIST_PAGE is set. When 1 is written to the +file, all pages in free lists will be written with 0. + +Zone lock is held during clear_freelist_pages, if the execution time is too +long, RCU CPU Stall warnings will be print. For each NUMA node, +clear_freelist_pages is performed on a "random" CPU of the NUMA node. +The time consuming is related to the hardware. + +============================================================== + compact_memory Available only when CONFIG_COMPACTION is set. When 1 is written to the file, diff --git a/mm/Kconfig b/mm/Kconfig index 80d7b47ca9f5..3a38eb4a6f02 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -849,4 +849,11 @@ config MEMORY_RELIABLE To enable this function, mirrored memory is needed and "kernelcore=reliable" need to be added in kernel parameters. +config CLEAR_FREELIST_PAGE + bool "Support for clear free list pages" + depends on MMU && SYSCTL + default n + help + Say y here to enable the clear free list pages feature. + endmenu diff --git a/mm/Makefile b/mm/Makefile index 741f9c250914..38291476ce22 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -6,6 +6,7 @@ KASAN_SANITIZE_slab_common.o := n KASAN_SANITIZE_slab.o := n KASAN_SANITIZE_slub.o := n +KASAN_SANITIZE_clear_freelist_page.o := n # These files are disabled because they produce non-interesting and/or # flaky coverage that is not a function of syscall inputs. E.g. slab is out of @@ -110,3 +111,4 @@ obj-$(CONFIG_PIN_MEMORY) += pin_mem.o obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o +obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o diff --git a/mm/clear_freelist_page.c b/mm/clear_freelist_page.c new file mode 100644 index 000000000000..69975f458dc7 --- /dev/null +++ b/mm/clear_freelist_page.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Support for clear free list pages. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define for_each_populated_zone_pgdat(pgdat, zone) \ + for (zone = pgdat->node_zones; \ + zone; \ + zone = next_pgdat_zone(zone)) \ + if (!populated_zone(zone)) \ + ; /* do nothing */ \ + else + +struct pgdat_entry { + struct pglist_data *pgdat; + struct work_struct work; +}; + +static DECLARE_WAIT_QUEUE_HEAD(clear_freelist_wait); +static DEFINE_MUTEX(clear_freelist_lock); +static atomic_t clear_freelist_workers; +static atomic_t clear_pages_num; +static int one = 1; + +/* + * next_pgdat_zone - helper magic for for_each_populated_zone_pgdat() + */ +static struct zone *next_pgdat_zone(struct zone *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) + zone++; + else + zone = NULL; + return zone; +} + +static void clear_pgdat_freelist_pages(struct work_struct *work) +{ + struct pgdat_entry *entry = container_of(work, struct pgdat_entry, work); + struct pglist_data *pgdat = entry->pgdat; + unsigned long flags, order, t; + struct page *page; + struct zone *zone; + + for_each_populated_zone_pgdat(pgdat, zone) { + spin_lock_irqsave(&zone->lock, flags); + for_each_migratetype_order(order, t) { + list_for_each_entry(page, &zone->free_area[order].free_list[t], lru) { +#ifdef CONFIG_KMAP_LOCAL + int i; + + /* Clear highmem by clear_highpage() */ + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); +#else + memset(page_address(page), 0, (1 << order) * PAGE_SIZE); +#endif + touch_nmi_watchdog(); + atomic_add(1 << order, &clear_pages_num); + } + } + spin_unlock_irqrestore(&zone->lock, flags); + + cond_resched(); + } + kfree(entry); + + if (atomic_dec_and_test(&clear_freelist_workers)) + wake_up(&clear_freelist_wait); +} + +static void init_clear_freelist_work(struct pglist_data *pgdat) +{ + struct pgdat_entry *entry; + + entry = kzalloc(sizeof(struct pgdat_entry), GFP_KERNEL); + if (!entry) + return; + + entry->pgdat = pgdat; + INIT_WORK(&entry->work, clear_pgdat_freelist_pages); + queue_work_node(pgdat->node_id, system_unbound_wq, &entry->work); +} + +static void clear_freelist_pages(void) +{ + struct pglist_data *pgdat; + + mutex_lock(&clear_freelist_lock); + drain_all_pages(NULL); + + for_each_online_pgdat(pgdat) { + atomic_inc(&clear_freelist_workers); + init_clear_freelist_work(pgdat); + } + + wait_event(clear_freelist_wait, atomic_read(&clear_freelist_workers) == 0); + + pr_debug("Cleared pages %d\nFree pages %lu\n", atomic_read(&clear_pages_num), + global_zone_page_state(NR_FREE_PAGES)); + atomic_set(&clear_pages_num, 0); + + mutex_unlock(&clear_freelist_lock); +} + +static int sysctl_clear_freelist_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + int val; + + table->data = &val; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!ret && write) + clear_freelist_pages(); + + return ret; +} + +static struct ctl_table clear_freelist_table[] = { + { + .procname = "clear_freelist_pages", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = &sysctl_clear_freelist_handler, + .extra1 = &one, + .extra2 = &one, + }, + { } +}; + +static struct ctl_table sys_ctl_table[] = { + { + .procname = "vm", + .mode = 0555, + .child = clear_freelist_table, + }, + { } +}; + +static int __init clear_freelist_init(void) +{ + register_sysctl_table(sys_ctl_table); + + return 0; +} +module_init(clear_freelist_init); -- GitLab