From faa3fdcd5f4abcca1b37aec81bc0ba99a5ed39c1 Mon Sep 17 00:00:00 2001
From: Guo Fan
Date: Fri, 16 Jul 2021 16:50:59 +0800
Subject: [PATCH] userswap: add a new flag 'MAP_REPLACE' for mmap()

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I40AXF
CVE: NA

--------------------------------------

To make sure no other userspace threads can access the memory region we
are swapping out, we need to unmap the memory region, map it to a new
address, and use the new address to perform the swapout.

This patch adds a new mmap() flag, 'MAP_REPLACE', which unmaps the pages
at the input address 'VA' and remaps them to a new 'tmpVA'.

Signed-off-by: Guo Fan
Signed-off-by: Xiongfeng Wang
Reviewed-by: Kefeng Wang
Signed-off-by: Yang Yingliang
Signed-off-by: Xiongfeng Wang
Reviewed-by: tong tiangen
Signed-off-by: Zheng Zengkai
---
 fs/proc/task_mmu.c                     |   3 +
 include/linux/mm.h                     |   5 +
 include/linux/swap.h                   |  12 +-
 include/trace/events/mmflags.h         |   7 +
 include/uapi/asm-generic/mman-common.h |   2 +
 mm/Kconfig                             |   9 ++
 mm/mmap.c                              | 209 +++++++++++++++++++++++++
 7 files changed, 246 insertions(+), 1 deletion(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 328018d6cb49..4dc080939bdd 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -661,6 +661,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_PKEY_BIT4)]	= "",
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_USERSWAP
+		[ilog2(VM_USWAP)]	= "us",
+#endif
 	};
 	size_t i;
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index dc5f3647e76d..eba4edf054ae 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -298,6 +298,11 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_NOHUGEPAGE	0x40000000	/* MADV_NOHUGEPAGE marked this vma */
 #define VM_MERGEABLE	0x80000000	/* KSM may merge identical pages */
 
+#ifdef CONFIG_USERSWAP
+/* bits [32:36] hold the Intel protection key, so use a higher bit for VM_USWAP */
+#define VM_USWAP	0x2000000000000000
+#endif
+
 #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
 #define VM_HIGH_ARCH_BIT_0	32	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_1	33	/* bit only usable on 64-bit architectures */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 724886936961..9b708c0288bc 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -53,6 +53,16 @@ static inline int current_is_kswapd(void)
  * actions on faults.
  */
 
+/*
+ * Userswap entry type
+ */
+#ifdef CONFIG_USERSWAP
+#define SWP_USERSWAP_NUM 1
+#define SWP_USERSWAP_ENTRY	(MAX_SWAPFILES + SWP_HWPOISON_NUM + SWP_MIGRATION_NUM + SWP_DEVICE_NUM)
+#else
+#define SWP_USERSWAP_NUM 0
+#endif
+
 /*
  * Unaddressable device memory support. See include/linux/hmm.h and
  * Documentation/vm/hmm.rst. Short description is we need struct pages for
@@ -93,7 +103,7 @@ static inline int current_is_kswapd(void)
 
 #define MAX_SWAPFILES \
 	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
-			SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_USERSWAP_NUM)
 
 /*
  * Magic header for a swap area. The first part of the union is
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 67018d367b9f..673c6f590800 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -137,6 +137,12 @@ IF_HAVE_PG_ARCH_2(PG_arch_2,	"arch_2"	)
 #define IF_HAVE_VM_SOFTDIRTY(flag,name)
 #endif
 
+#ifdef CONFIG_USERSWAP
+#define IF_HAVE_VM_USWAP(flag,name) {flag, name},
+#else
+#define IF_HAVE_VM_USWAP(flag,name)
+#endif
+
 #define __def_vmaflag_names						\
 	{VM_READ,			"read"		},		\
 	{VM_WRITE,			"write"		},		\
@@ -169,6 +175,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY,	"softdirty"	)		\
 	{VM_MIXEDMAP,			"mixedmap"	},		\
 	{VM_HUGEPAGE,			"hugepage"	},		\
 	{VM_NOHUGEPAGE,			"nohugepage"	},		\
+IF_HAVE_VM_USWAP(VM_USWAP,		"userswap"	)		\
 	{VM_MERGEABLE,			"mergeable"	}		\
 
 #define show_vma_flags(flags)						\
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index f94f65d429be..e75b65364dce 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -30,6 +30,8 @@
 #define MAP_SYNC		0x080000 /* perform synchronous page faults for the mapping */
 #define MAP_FIXED_NOREPLACE	0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
 
+#define MAP_REPLACE		0x1000000	/* unmap the pages at 'addr' and remap them to a new address */
+
 #define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be
 					 * uninitialized */
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 398ddb19155e..f08be27b9cf0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -513,6 +513,15 @@ config ETMEM_SWAP
 	help
 	  etmem page swap feature
 
+config USERSWAP
+	bool "Enable User Swap"
+	depends on MMU && USERFAULTFD
+	depends on X86 || ARM64
+	default n
+	help
+	  Support for User Swap. This is based on userfaultfd. We can implement
+	  our own swap-out and swap-in functions in userspace.
+
 config CMA
 	bool "Contiguous Memory Allocator"
 	depends on MMU
diff --git a/mm/mmap.c b/mm/mmap.c
index 5c8b4485860d..069d88aeeeba 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -47,6 +47,7 @@
 #include <linux/pkeys.h>
 #include <linux/oom.h>
 #include <linux/sched/mm.h>
+#include <linux/swapops.h>
 
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1398,6 +1399,171 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode,
 	return true;
 }
 
+#ifdef CONFIG_USERSWAP
+/*
+ * Check whether the pages in [addr, addr + len) can be user-swapped. If so,
+ * take a reference on each page and return the pages through the output
+ * parameter 'ppages'.
+ */
+int pages_can_be_swapped(struct mm_struct *mm, unsigned long addr,
+			 unsigned long len, struct page ***ppages)
+{
+	struct vm_area_struct *vma;
+	struct page *page = NULL;
+	struct page **pages = NULL;
+	unsigned long addr_end;
+	int i, page_num = 0;
+	int ret;
+
+	pages = kmalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	addr_end = addr + len;
+	while (addr < addr_end) {
+		vma = find_vma(mm, addr);
+		if (!vma || !vma_is_anonymous(vma) || vma->vm_file ||
+		    (vma->vm_flags & (VM_LOCKED | VM_STACK | VM_IO | VM_PFNMAP))) {
+			ret = -EINVAL;
+			goto out;
+		}
+		if (!(vma->vm_flags & VM_UFFD_MISSING)) {
+			ret = -EAGAIN;
+			goto out;
+		}
+get_again:
+		/* follow_page() takes a page reference; drop it after the remap */
+		page = follow_page(vma, addr, FOLL_GET);
+		if (IS_ERR_OR_NULL(page)) {
+			ret = -ENODEV;
+			goto out;
+		}
+		pages[page_num] = page;
+		page_num++;
+		if (!PageAnon(page) || !PageSwapBacked(page) ||
+		    PageHuge(page) || PageSwapCache(page)) {
+			ret = -EINVAL;
+			goto out;
+		} else if (PageTransCompound(page)) {
+			if (trylock_page(page)) {
+				if (!split_huge_page(page)) {
+					unlock_page(page);
+					put_page(page);
+					page_num--;
+					goto get_again;
+				} else {
+					unlock_page(page);
+					ret = -EINVAL;
+					goto out;
+				}
+			} else {
+				ret = -EINVAL;
+				goto out;
+			}
+		}
+		if (page_mapcount(page) > 1 ||
+		    page_mapcount(page) + 1 != page_count(page)) {
+			ret = -EBUSY;
+			goto out;
+		}
+		addr += PAGE_SIZE;
+	}
+
+	*ppages = pages;
+	return 0;
+
+out:
+	for (i = 0; i < page_num; i++)
+		put_page(pages[i]);
+	kfree(pages);
+	*ppages = NULL;
+	return ret;
+}
+
+/*
+ * In the uswap case, bit 0 of the returned address indicates whether the
+ * pages are dirty.
+ */
+#define USWAP_PAGES_DIRTY	1
+
+/* unmap the pages in [addr_start, addr_start + len) and remap them to 'new_addr' */
+unsigned long do_user_swap(struct mm_struct *mm, unsigned long addr_start,
+			   unsigned long len, struct page **pages, unsigned long new_addr)
+{
+	struct vm_area_struct *vma;
+	struct page *page;
+	struct mmu_notifier_range range;
+	pmd_t *pmd;
+	pte_t *pte, old_pte;
+	spinlock_t *ptl;
+	unsigned long addr, addr_end;
+	bool pages_dirty = false;
+	int i, err;
+
+	addr_end = addr_start + len;
+	lru_add_drain();
+	addr = addr_start;
+	i = 0;
+	while (addr < addr_end) {
+		page = pages[i];
+		vma = find_vma(mm, addr);
+		if (!vma) {
+			WARN(1, "find_vma failed\n");
+			return -EINVAL;
+		}
+		mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma,
+					vma->vm_mm, addr, addr + PAGE_SIZE);
+		mmu_notifier_invalidate_range_start(&range);
+		pmd = mm_find_pmd(mm, addr);
+		if (!pmd) {
+			mmu_notifier_invalidate_range_end(&range);
+			WARN(1, "mm_find_pmd failed, addr: %lx\n", addr);
+			return -ENXIO;
+		}
+		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+		flush_cache_page(vma, addr, pte_pfn(*pte));
+		old_pte = ptep_clear_flush(vma, addr, pte);
+		if (pte_dirty(old_pte) || PageDirty(page))
+			pages_dirty = true;
+		set_pte_at(mm, addr, pte,
+			   swp_entry_to_pte(swp_entry(SWP_USERSWAP_ENTRY, page_to_pfn(page))));
+		dec_mm_counter(mm, MM_ANONPAGES);
+		page_remove_rmap(page, false);
+		put_page(page);
+
+		pte_unmap_unlock(pte, ptl);
+		mmu_notifier_invalidate_range_end(&range);
+		vma->vm_flags |= VM_USWAP;
+		page->mapping = NULL;
+		addr += PAGE_SIZE;
+		i++;
+	}
+
+	addr_start = new_addr;
+	addr_end = new_addr + len;
+	addr = addr_start;
+	vma = find_vma(mm, addr);
+	i = 0;
+	while (addr < addr_end) {
+		page = pages[i];
+		if (addr >= vma->vm_end)
+			vma = find_vma(mm, addr);
+		err = vm_insert_page(vma, addr, page);
+		if (err)
+			pr_err("vm_insert_page failed: %d\n", err);
+		i++;
+		addr += PAGE_SIZE;
+	}
+	vma->vm_flags |= VM_USWAP;
+
+	if (pages_dirty)
+		new_addr = new_addr | USWAP_PAGES_DIRTY;
+
+	return new_addr;
+}
+#endif
+
 /*
  * The caller must write-lock current->mm->mmap_lock.
  */
@@ -1409,6 +1575,12 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	struct mm_struct *mm = current->mm;
 	vm_flags_t vm_flags;
 	int pkey = 0;
+#ifdef CONFIG_USERSWAP
+	struct page **pages = NULL;
+	unsigned long addr_start = addr;
+	int i, page_num = 0;
+	unsigned long ret;
+#endif
 
 	*populate = 0;
 
@@ -1425,6 +1597,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 		if (!(file && path_noexec(&file->f_path)))
 			prot |= PROT_EXEC;
 
+#ifdef CONFIG_USERSWAP
+	if (flags & MAP_REPLACE) {
+		if (offset_in_page(addr) || (len % PAGE_SIZE))
+			return -EINVAL;
+		page_num = len / PAGE_SIZE;
+		ret = pages_can_be_swapped(mm, addr, len, &pages);
+		if (ret)
+			return ret;
+	}
+#endif
+
 	/* force arch specific MAP_FIXED handling in get_unmapped_area */
 	if (flags & MAP_FIXED_NOREPLACE)
 		flags |= MAP_FIXED;
@@ -1580,12 +1763,38 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 			vm_flags |= VM_NORESERVE;
 	}
 
+#ifdef CONFIG_USERSWAP
+	/* mark the vma as special to avoid merging with other vmas */
+	if (flags & MAP_REPLACE)
+		vm_flags |= VM_SPECIAL;
+#endif
+
 	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
 	if (!IS_ERR_VALUE(addr) &&
 	    ((vm_flags & VM_LOCKED) ||
 	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
 		*populate = len;
+#ifndef CONFIG_USERSWAP
 	return addr;
+#else
+	if (!(flags & MAP_REPLACE))
+		return addr;
+
+	if (IS_ERR_VALUE(addr)) {
+		pr_info("mmap_region failed, return addr: %lx\n", addr);
+		ret = addr;
+		goto out;
+	}
+
+	ret = do_user_swap(mm, addr_start, len, pages, addr);
+out:
+	/* drop the references taken by follow_page() above */
+	for (i = 0; i < page_num; i++)
+		put_page(pages[i]);
+	kfree(pages);
+	return ret;
+#endif
 }
 
 unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
--
GitLab
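
For illustration, a minimal userspace sketch of the intended swap-out flow
follows. It assumes the region was registered with userfaultfd in missing
mode beforehand, that MAP_REPLACE has the value defined in this patch, and
that store_pages() is a hypothetical helper standing in for the userspace
swap backend; only the mmap(MAP_REPLACE) call and the USWAP_PAGES_DIRTY bit
come from this patch.

#include <stdint.h>
#include <sys/mman.h>

#ifndef MAP_REPLACE
#define MAP_REPLACE 0x1000000		/* value introduced by this patch */
#endif
#define USWAP_PAGES_DIRTY 1		/* bit 0 of the address returned by mmap() */

/* hypothetical backend helper: persist 'len' bytes at 'src' to the swap store */
extern int store_pages(const void *src, size_t len);

/*
 * Swap out [va, va + len): the kernel unmaps 'va' and remaps its pages at a
 * new temporary address, which it returns; bit 0 of that address reports
 * whether any of the pages were dirty.
 */
static int uswap_swap_out(void *va, size_t len)
{
	uintptr_t ret;
	void *tmpva;

	ret = (uintptr_t)mmap(va, len, PROT_READ | PROT_WRITE,
			      MAP_ANONYMOUS | MAP_PRIVATE | MAP_REPLACE, -1, 0);
	if ((void *)ret == MAP_FAILED)
		return -1;

	/* strip the dirty bit before using the temporary address */
	tmpva = (void *)(ret & ~(uintptr_t)USWAP_PAGES_DIRTY);

	if (store_pages(tmpva, len))	/* a real swapper could skip clean pages */
		return -1;

	/* release the temporary mapping; touching 'va' now faults to userspace */
	return munmap(tmpva, len);
}

On a later access to 'va', the userfaultfd reader would fetch the stored
contents from the backend and resolve the fault, e.g. with UFFDIO_COPY.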