Commit e3452806 authored by Guo Fan, committed by Cheng Jian

userswap: add a new flag 'MAP_REPLACE' for mmap()

hulk inclusion
category: feature
bugzilla: 47439
CVE: NA

-------------------------------------------------

To make sure no other userspace thread can access the memory region we are
swapping out, we need to unmap the region, map it to a new address, and use
that new address to perform the swapout. We add a new mmap() flag,
'MAP_REPLACE', which unmaps the pages at the input address 'VA' and remaps
them to a new tmpVA.
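
As a rough illustration of the intended flow from userspace (a minimal sketch;
the MAP_REPLACE value, the helper name and the userfaultfd registration around
it are assumptions for illustration, not code from this patch):

/* Sketch of a swap-out step built on MAP_REPLACE; illustrative only. */
#include <stdint.h>
#include <stddef.h>
#include <sys/mman.h>

#ifndef MAP_REPLACE
#define MAP_REPLACE		0x1000000	/* value introduced by this patch */
#endif
#define USWAP_PAGES_DIRTY	1		/* bit 0 of the returned address */

/*
 * 'va' must lie in an anonymous region already registered with userfaultfd
 * in MISSING mode; 'len' must be a multiple of the page size.  On success
 * the pages formerly mapped at 'va' are mapped at the returned tmpVA, and
 * bit 0 of the raw return value reports whether they were dirty.
 */
static void *replace_region(void *va, size_t len, int *dirty)
{
	uintptr_t tmp = (uintptr_t)mmap(va, len, PROT_READ | PROT_WRITE,
					MAP_PRIVATE | MAP_ANONYMOUS | MAP_REPLACE,
					-1, 0);

	if ((void *)tmp == MAP_FAILED)
		return NULL;

	*dirty = tmp & USWAP_PAGES_DIRTY;
	return (void *)(tmp & ~(uintptr_t)USWAP_PAGES_DIRTY);
}

The caller would then write the pages out from the temporary VA, munmap() it,
and later resolve userfaultfd MISSING faults on 'va' to swap the data back in.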
Signed-off-by: Guo Fan <guofan5@huawei.com>
Signed-off-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Jing Xiangfeng <jingxiangfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
Parent 8bc9bb27
@@ -665,6 +665,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_PKEY_BIT4)] = "",
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_USERSWAP
+		[ilog2(VM_USWAP)] = "us",
+#endif
 	};
 	size_t i;
......
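
With the new mnemonic in place, a user-swapped VMA would be expected to list
"us" in the VmFlags line of /proc/<pid>/smaps, for example (illustrative
output only):

	VmFlags: rd wr mr mw me ac us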
@@ -236,6 +236,11 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_CHECKNODE 0x200000000

+#ifdef CONFIG_USERSWAP
+/* bits [32:36] are used by Intel protection keys, so use a high bit for VM_USWAP */
+#define VM_USWAP 0x2000000000000000
+#endif
+
 #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
 #define VM_HIGH_ARCH_BIT_0	32	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_1	33	/* bit only usable on 64-bit architectures */
......
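
For orientation (not taken from the patch): 0x2000000000000000 is bit 61,
well above the VM_HIGH_ARCH_* range used for protection keys, which a quick
check confirms:

#include <stdio.h>

int main(void)
{
	unsigned long long vm_uswap = 0x2000000000000000ULL;	/* VM_USWAP */

	/* __builtin_ctzll() gives the index of the lowest set bit: prints 61 */
	printf("VM_USWAP is bit %d\n", __builtin_ctzll(vm_uswap));
	return 0;
}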
@@ -52,6 +52,16 @@ static inline int current_is_kswapd(void)
  * actions on faults.
  */

+/*
+ * Userswap entry type
+ */
+#ifdef CONFIG_USERSWAP
+#define SWP_USERSWAP_NUM 1
+#define SWP_USERSWAP_ENTRY (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+SWP_DEVICE_NUM)
+#else
+#define SWP_USERSWAP_NUM 0
+#endif
+
 /*
  * Unaddressable device memory support. See include/linux/hmm.h and
  * Documentation/vm/hmm.rst. Short description is we need struct pages for
@@ -92,7 +102,7 @@ static inline int current_is_kswapd(void)
 #define MAX_SWAPFILES \
 	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
-	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_USERSWAP_NUM)
......
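
To see where the new entry type lands, the arithmetic can be spelled out with
typical values (assumed here: MAX_SWAPFILES_SHIFT of 5 with device-private,
migration and hwpoison entries all configured; the real numbers depend on the
kernel configuration):

/* Illustrative swap-type arithmetic with assumed config values. */
#define MAX_SWAPFILES_SHIFT	5
#define SWP_DEVICE_NUM		2	/* CONFIG_DEVICE_PRIVATE */
#define SWP_MIGRATION_NUM	2	/* CONFIG_MIGRATION */
#define SWP_HWPOISON_NUM	1	/* CONFIG_MEMORY_FAILURE */
#define SWP_USERSWAP_NUM	1	/* CONFIG_USERSWAP */

/* 32 - 2 - 2 - 1 - 1 = 26 types remain for ordinary swap files */
#define MAX_SWAPFILES \
	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
	 SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_USERSWAP_NUM)

/* 26 + 1 + 2 + 2 = 31: userswap entries take the last of the 32 types */
#define SWP_USERSWAP_ENTRY \
	(MAX_SWAPFILES + SWP_HWPOISON_NUM + SWP_MIGRATION_NUM + SWP_DEVICE_NUM)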
@@ -130,6 +130,12 @@ IF_HAVE_PG_IDLE(PG_idle, "idle"), \
 #define IF_HAVE_VM_SOFTDIRTY(flag,name)
 #endif

+#ifdef CONFIG_USERSWAP
+#define IF_HAVE_VM_USWAP(flag,name) {flag, name },
+#else
+#define IF_HAVE_VM_USWAP(flag,name)
+#endif
+
 #define __def_vmaflag_names \
 	{VM_READ, "read" }, \
 	{VM_WRITE, "write" }, \
@@ -161,6 +167,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
 	{VM_MIXEDMAP, "mixedmap" }, \
 	{VM_HUGEPAGE, "hugepage" }, \
 	{VM_NOHUGEPAGE, "nohugepage" }, \
+IF_HAVE_VM_USWAP(VM_USWAP, "userswap" ) \
 	{VM_MERGEABLE, "mergeable" } \

 #define show_vma_flags(flags) \
......
@@ -17,6 +17,10 @@
 #define MAP_SYNC 0x80000 /* perform synchronous page faults for the mapping */
 #define MAP_PA32BIT 0x400000 /* physical address is within 4G */

+#ifdef CONFIG_USERSWAP
+#define MAP_REPLACE 0x1000000
+#endif
+
 /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */

 #define MCL_CURRENT 1 /* lock all current mappings */
......
@@ -503,6 +503,15 @@ config SHRINK_PAGECACHE
 	  if unsure, say N to disable the SHRINK_PAGECACHE.

+config USERSWAP
+	bool "Enable User Swap"
+	depends on MMU && USERFAULTFD
+	depends on X86 || ARM64
+	default n
+	help
+	  Support for User Swap. This is based on userfaultfd. We can implement
+	  our own swapout and swapin functions in userspace.
+
 config CMA
 	bool "Contiguous Memory Allocator"
 	depends on HAVE_MEMBLOCK && MMU
......
@@ -46,6 +46,7 @@
 #include <linux/pkeys.h>
 #include <linux/oom.h>
 #include <linux/sched/mm.h>
+#include <linux/swapops.h>

 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1372,6 +1373,169 @@ int unregister_mmap_notifier(struct notifier_block *nb)
 EXPORT_SYMBOL_GPL(unregister_mmap_notifier);
 #endif

+#ifdef CONFIG_USERSWAP
+/*
+ * Check if pages between 'addr ~ addr+len' can be user swapped. If so, get
+ * the reference of the pages and return the pages through input parameters
+ * 'ppages'.
+ */
+int pages_can_be_swapped(struct mm_struct *mm, unsigned long addr,
+			 unsigned long len, struct page ***ppages)
+{
+	struct vm_area_struct *vma;
+	struct page *page = NULL;
+	struct page **pages = NULL;
+	unsigned long addr_start, addr_end;
+	unsigned long ret;
+	int i, page_num = 0;
+
+	pages = kmalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	addr_start = addr;
+	addr_end = addr + len;
+	while (addr < addr_end) {
+		vma = find_vma(mm, addr);
+		if (!vma || !vma_is_anonymous(vma) ||
+		    (vma->vm_flags & VM_LOCKED) || vma->vm_file
+		    || (vma->vm_flags & VM_STACK) || (vma->vm_flags & (VM_IO | VM_PFNMAP))) {
+			ret = -EINVAL;
+			goto out;
+		}
+		if (!(vma->vm_flags & VM_UFFD_MISSING)) {
+			ret = -EAGAIN;
+			goto out;
+		}
+get_again:
+		/* follow_page will inc page ref, dec the ref after we remap the page */
+		page = follow_page(vma, addr, FOLL_GET);
+		if (IS_ERR_OR_NULL(page)) {
+			ret = -ENODEV;
+			goto out;
+		}
+		pages[page_num] = page;
+		page_num++;
+		if (!PageAnon(page) || !PageSwapBacked(page) || PageHuge(page) || PageSwapCache(page)) {
+			ret = -EINVAL;
+			goto out;
+		} else if (PageTransCompound(page)) {
+			if (trylock_page(page)) {
+				if (!split_huge_page(page)) {
+					put_page(page);
+					page_num--;
+					unlock_page(page);
+					goto get_again;
+				} else {
+					unlock_page(page);
+					ret = -EINVAL;
+					goto out;
+				}
+			} else {
+				ret = -EINVAL;
+				goto out;
+			}
+		}
+		if (page_mapcount(page) > 1 || page_mapcount(page) + 1 != page_count(page)) {
+			ret = -EBUSY;
+			goto out;
+		}
+		addr += PAGE_SIZE;
+	}
+
+	*ppages = pages;
+	return 0;
+
+out:
+	for (i = 0; i < page_num; i++)
+		put_page(pages[i]);
+	if (pages)
+		kfree(pages);
+	*ppages = NULL;
+	return ret;
+}
+
+/*
+ * In uswap situation, we use the bit 0 of the returned address to indicate
+ * whether the pages are dirty.
+ */
+#define USWAP_PAGES_DIRTY 1
+
+/* unmap the pages between 'addr ~ addr+len' and remap them to a new address */
+unsigned long do_user_swap(struct mm_struct *mm, unsigned long addr_start,
+			   unsigned long len, struct page **pages, unsigned long new_addr)
+{
+	struct vm_area_struct *vma;
+	struct page *page;
+	pmd_t *pmd;
+	pte_t *pte, old_pte;
+	spinlock_t *ptl;
+	unsigned long addr, addr_end;
+	bool pages_dirty = false;
+	int i, err;
+
+	addr_end = addr_start + len;
+	lru_add_drain();
+	mmu_notifier_invalidate_range_start(mm, addr_start, addr_end);
+	addr = addr_start;
+	i = 0;
+	while (addr < addr_end) {
+		page = pages[i];
+		vma = find_vma(mm, addr);
+		if (!vma) {
+			mmu_notifier_invalidate_range_end(mm, addr_start, addr_end);
+			WARN_ON("find_vma failed\n");
+			return -EINVAL;
+		}
+		pmd = mm_find_pmd(mm, addr);
+		if (!pmd) {
+			mmu_notifier_invalidate_range_end(mm, addr_start, addr_end);
+			WARN_ON("mm_find_pmd failed, addr:%llx\n");
+			return -ENXIO;
+		}
+		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+		flush_cache_page(vma, addr, pte_pfn(*pte));
+		old_pte = ptep_clear_flush(vma, addr, pte);
+		if (pte_dirty(old_pte) || PageDirty(page))
+			pages_dirty = true;
+		set_pte(pte, swp_entry_to_pte(swp_entry(SWP_USERSWAP_ENTRY, page_to_pfn(page))));
+		dec_mm_counter(mm, MM_ANONPAGES);
+		page_remove_rmap(page, false);
+		put_page(page);
+		pte_unmap_unlock(pte, ptl);
+		vma->vm_flags |= VM_USWAP;
+		page->mapping = NULL;
+		addr += PAGE_SIZE;
+		i++;
+	}
+	mmu_notifier_invalidate_range_end(mm, addr_start, addr_end);
+
+	addr_start = new_addr;
+	addr_end = new_addr + len;
+	addr = addr_start;
+	vma = find_vma(mm, addr);
+	i = 0;
+	while (addr < addr_end) {
+		page = pages[i];
+		if (addr > vma->vm_end - 1)
+			vma = find_vma(mm, addr);
+		err = vm_insert_page(vma, addr, page);
+		if (err) {
+			pr_err("vm_insert_page failed:%d\n", err);
+		}
+		i++;
+		addr += PAGE_SIZE;
+	}
+	vma->vm_flags |= VM_USWAP;
+
+	if (pages_dirty)
+		new_addr = new_addr | USWAP_PAGES_DIRTY;
+
+	return new_addr;
+}
+#endif
+
 /*
  * The caller must hold down_write(&current->mm->mmap_sem).
  */
@@ -1383,6 +1547,12 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 {
 	struct mm_struct *mm = current->mm;
 	int pkey = 0;
+#ifdef CONFIG_USERSWAP
+	struct page **pages = NULL;
+	unsigned long addr_start = addr;
+	int i, page_num = 0;
+	unsigned long ret;
+#endif

 	*populate = 0;
@@ -1399,6 +1569,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	if (!(file && path_noexec(&file->f_path)))
 		prot |= PROT_EXEC;

+#ifdef CONFIG_USERSWAP
+	if (flags & MAP_REPLACE) {
+		if (offset_in_page(addr) || (len % PAGE_SIZE))
+			return -EINVAL;
+
+		page_num = len / PAGE_SIZE;
+		ret = pages_can_be_swapped(mm, addr, len, &pages);
+		if (ret)
+			return ret;
+	}
+#endif
+
 	/* force arch specific MAP_FIXED handling in get_unmapped_area */
 	if (flags & MAP_FIXED_NOREPLACE)
 		flags |= MAP_FIXED;
@@ -1571,12 +1752,38 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	if (flags & MAP_CHECKNODE)
 		set_vm_checknode(&vm_flags, flags);

+#ifdef CONFIG_USERSWAP
+	/* mark the vma as special to avoid merging with other vmas */
+	if (flags & MAP_REPLACE)
+		vm_flags |= VM_SPECIAL;
+#endif
+
 	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
 	if (!IS_ERR_VALUE(addr) &&
 	    ((vm_flags & VM_LOCKED) ||
 	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
 		*populate = len;
+
+#ifndef CONFIG_USERSWAP
 	return addr;
+#else
+	if (!(flags & MAP_REPLACE))
+		return addr;
+
+	if (IS_ERR_VALUE(addr)) {
+		pr_info("mmap_region failed, return addr:%lx\n", addr);
+		ret = addr;
+		goto out;
+	}
+
+	ret = do_user_swap(mm, addr_start, len, pages, addr);
+out:
+	/* follow_page() above increased the reference */
+	for (i = 0; i < page_num; i++)
+		put_page(pages[i]);
+	if (pages)
+		kfree(pages);
+
+	return ret;
+#endif
 }

 unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
......
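
The swap-in side is not part of this patch; under the userfaultfd-based design
described in the Kconfig help it would be handled in userspace, roughly along
these lines (a sketch only: fetch_from_backing_store() is a placeholder and
error handling is omitted):

/*
 * Hypothetical swap-in handler: the region was registered with userfaultfd
 * in MISSING mode, so a later access to a swapped-out page raises a fault
 * event that userspace resolves with UFFDIO_COPY.
 */
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

extern void fetch_from_backing_store(unsigned long addr, void *buf, size_t len);

static void handle_missing_fault(int uffd, size_t page_size, void *copy_buf)
{
	struct uffd_msg msg;
	struct uffdio_copy copy;

	if (read(uffd, &msg, sizeof(msg)) != (ssize_t)sizeof(msg) ||
	    msg.event != UFFD_EVENT_PAGEFAULT)
		return;

	/* Re-populate the faulting page from wherever it was swapped to. */
	unsigned long addr = msg.arg.pagefault.address & ~(page_size - 1);
	fetch_from_backing_store(addr, copy_buf, page_size);

	copy.dst = addr;
	copy.src = (unsigned long)copy_buf;
	copy.len = page_size;
	copy.mode = 0;
	ioctl(uffd, UFFDIO_COPY, &copy);
}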