Commit faa3fdcd authored by Guo Fan, committed by Zheng Zengkai

userswap: add a new flag 'MAP_REPLACE' for mmap()

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I40AXF
CVE: NA

--------------------------------------

To make sure no other userspace threads can access the memory region we
are swapping out, we need to unmap the memory region, remap it to a new
address, and use the new address to perform the swapout. Add a new
mmap() flag, 'MAP_REPLACE', which unmaps the pages at the input address
'VA' and remaps them to a new 'tmpVA'.
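
For illustration, a userspace caller would drive the swapout roughly as
in the sketch below. The sketch is not part of this patch: the helper
name and error handling are invented, and it assumes the region was
already registered with userfaultfd (see the VM_UFFD_MISSING check in
pages_can_be_swapped() below). The dirty-bit convention (bit 0 of the
returned address) comes from USWAP_PAGES_DIRTY in the mm/mmap.c hunk.

/* Hypothetical usage sketch; not part of this patch. */
#include <stdint.h>
#include <sys/mman.h>

#ifndef MAP_REPLACE
#define MAP_REPLACE 0x1000000		/* value defined by this patch */
#endif

/* Unmap 'va' and get the same pages back at a temporary address. */
static void *swap_out_region(void *va, size_t len, int *dirty)
{
	void *tmp = mmap(va, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_REPLACE, -1, 0);
	if (tmp == MAP_FAILED)
		return MAP_FAILED;

	/* bit 0 of the returned address reports whether the pages were dirty */
	*dirty = (uintptr_t)tmp & 1;
	/* caller writes the pages out (if dirty) and then munmap()s them */
	return (void *)((uintptr_t)tmp & ~(uintptr_t)1);
}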
Signed-off-by: Guo Fan <guofan5@huawei.com>
Signed-off-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Reviewed-by: tong tiangen <tongtiangen@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Parent c78eb5c1
@@ -661,6 +661,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_PKEY_BIT4)]	= "",
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_USERSWAP
+		[ilog2(VM_USWAP)]	= "us",
+#endif
 	};
 	size_t i;
......
@@ -298,6 +298,11 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_NOHUGEPAGE	0x40000000	/* MADV_NOHUGEPAGE marked this vma */
 #define VM_MERGEABLE	0x80000000	/* KSM may merge identical pages */
+#ifdef CONFIG_USERSWAP
+/* bits [32:36] are Intel's protection-key bits, so use a high bit for VM_USWAP */
+#define VM_USWAP	0x2000000000000000
+#endif
+
 #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
 #define VM_HIGH_ARCH_BIT_0	32	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_1	33	/* bit only usable on 64-bit architectures */
......
@@ -53,6 +53,16 @@ static inline int current_is_kswapd(void)
  * actions on faults.
  */
+/*
+ * Userswap entry type
+ */
+#ifdef CONFIG_USERSWAP
+#define SWP_USERSWAP_NUM 1
+#define SWP_USERSWAP_ENTRY	(MAX_SWAPFILES + SWP_HWPOISON_NUM + SWP_MIGRATION_NUM + SWP_DEVICE_NUM)
+#else
+#define SWP_USERSWAP_NUM 0
+#endif
+
 /*
  * Unaddressable device memory support. See include/linux/hmm.h and
  * Documentation/vm/hmm.rst. Short description is we need struct pages for
@@ -93,7 +103,7 @@ static inline int current_is_kswapd(void)
 #define MAX_SWAPFILES \
 	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
-	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_USERSWAP_NUM)
 /*
  * Magic header for a swap area. The first part of the union is
......
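
For context, a minimal sketch of how the reserved type is meant to be
consumed. The two helpers below are illustrative, not in the patch, but
they use the existing swp_entry()/swp_type() accessors from
linux/swapops.h, and the mm/mmap.c hunk below builds its PTEs with
exactly this encoding.

#include <linux/mm.h>
#include <linux/swapops.h>

/* Illustrative only: encode a userswap entry, the reserved type plus the
 * page frame number, as do_user_swap() does when it fills the PTE. */
static inline swp_entry_t uswap_entry_from_page(struct page *page)
{
	return swp_entry(SWP_USERSWAP_ENTRY, page_to_pfn(page));
}

/* Illustrative only: recognize such an entry again, e.g. in a fault path. */
static inline bool is_uswap_entry(swp_entry_t entry)
{
	return swp_type(entry) == SWP_USERSWAP_ENTRY;
}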
@@ -137,6 +137,12 @@ IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" )
 #define IF_HAVE_VM_SOFTDIRTY(flag,name)
 #endif
+#ifdef CONFIG_USERSWAP
+#define IF_HAVE_VM_USWAP(flag,name) {flag, name },
+#else
+#define IF_HAVE_VM_USWAP(flag,name)
+#endif
+
 #define __def_vmaflag_names \
 	{VM_READ, "read" }, \
 	{VM_WRITE, "write" }, \
@@ -169,6 +175,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
 	{VM_MIXEDMAP, "mixedmap" }, \
 	{VM_HUGEPAGE, "hugepage" }, \
 	{VM_NOHUGEPAGE, "nohugepage" }, \
+	IF_HAVE_VM_USWAP(VM_USWAP, "userswap" ) \
 	{VM_MERGEABLE, "mergeable" } \
 #define show_vma_flags(flags) \
......
@@ -30,6 +30,8 @@
 #define MAP_SYNC		0x080000	/* perform synchronous page faults for the mapping */
 #define MAP_FIXED_NOREPLACE	0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
+#define MAP_REPLACE		0x1000000	/* userswap: unmap the given range and remap it to a new address */
+
 #define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be
 					 * uninitialized */
......
@@ -513,6 +513,15 @@ config ETMEM_SWAP
 	help
 	  etmem page swap feature
+config USERSWAP
+	bool "Enable User Swap"
+	depends on MMU && USERFAULTFD
+	depends on X86 || ARM64
+	default n
+	help
+	  Support for User Swap. This is based on userfaultfd. We can implement
+	  our own swapout and swapin functions in userspace.
+
 config CMA
 	bool "Contiguous Memory Allocator"
 	depends on MMU
......
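
To build with the feature enabled, a config fragment along these lines
would be used (a sketch; the option names come from this hunk and its
stated dependency on USERFAULTFD):

# illustrative .config fragment
CONFIG_MMU=y
CONFIG_USERFAULTFD=y
CONFIG_USERSWAP=y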
@@ -47,6 +47,7 @@
 #include <linux/pkeys.h>
 #include <linux/oom.h>
 #include <linux/sched/mm.h>
+#include <linux/swapops.h>
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1398,6 +1399,171 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode,
 	return true;
 }
+
+#ifdef CONFIG_USERSWAP
+/*
+ * Check whether the pages in [addr, addr + len) can be user-swapped. If so,
+ * take a reference on each page and return the page array through the output
+ * parameter 'ppages'.
+ */
+int pages_can_be_swapped(struct mm_struct *mm, unsigned long addr,
+			 unsigned long len, struct page ***ppages)
+{
+	struct vm_area_struct *vma;
+	struct page *page = NULL;
+	struct page **pages = NULL;
+	unsigned long addr_end = addr + len;
+	int i, ret, page_num = 0;
+
+	pages = kmalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	while (addr < addr_end) {
+		vma = find_vma(mm, addr);
+		if (!vma || !vma_is_anonymous(vma) || vma->vm_file ||
+		    (vma->vm_flags & (VM_LOCKED | VM_STACK | VM_IO | VM_PFNMAP))) {
+			ret = -EINVAL;
+			goto out;
+		}
+		if (!(vma->vm_flags & VM_UFFD_MISSING)) {
+			ret = -EAGAIN;
+			goto out;
+		}
+get_again:
+		/* follow_page() takes a page reference; drop it after the remap */
+		page = follow_page(vma, addr, FOLL_GET);
+		if (IS_ERR_OR_NULL(page)) {
+			ret = -ENODEV;
+			goto out;
+		}
+		pages[page_num++] = page;
+		if (!PageAnon(page) || !PageSwapBacked(page) ||
+		    PageHuge(page) || PageSwapCache(page)) {
+			ret = -EINVAL;
+			goto out;
+		} else if (PageTransCompound(page)) {
+			if (trylock_page(page)) {
+				if (!split_huge_page(page)) {
+					put_page(page);
+					page_num--;
+					unlock_page(page);
+					goto get_again;
+				} else {
+					unlock_page(page);
+					ret = -EINVAL;
+					goto out;
+				}
+			} else {
+				ret = -EINVAL;
+				goto out;
+			}
+		}
+		/* the page must be mapped exactly once with no extra references */
+		if (page_mapcount(page) > 1 ||
+		    page_mapcount(page) + 1 != page_count(page)) {
+			ret = -EBUSY;
+			goto out;
+		}
+		addr += PAGE_SIZE;
+	}
+
+	*ppages = pages;
+	return 0;
+
+out:
+	for (i = 0; i < page_num; i++)
+		put_page(pages[i]);
+	kfree(pages);
+	*ppages = NULL;
+	return ret;
+}
+
+/*
+ * In the userswap case, bit 0 of the returned address indicates whether the
+ * pages are dirty.
+ */
+#define USWAP_PAGES_DIRTY	1
+
+/* Unmap the pages in [addr_start, addr_start + len) and remap them to new_addr */
+unsigned long do_user_swap(struct mm_struct *mm, unsigned long addr_start,
+			   unsigned long len, struct page **pages,
+			   unsigned long new_addr)
+{
+	struct vm_area_struct *vma;
+	struct page *page;
+	struct mmu_notifier_range range;
+	pmd_t *pmd;
+	pte_t *pte, old_pte;
+	spinlock_t *ptl;
+	unsigned long addr, addr_end;
+	bool pages_dirty = false;
+	int i, err;
+
+	addr_end = addr_start + len;
+	lru_add_drain();
+	addr = addr_start;
+	i = 0;
+	while (addr < addr_end) {
+		page = pages[i];
+		vma = find_vma(mm, addr);
+		if (!vma) {
+			WARN(1, "find_vma failed\n");
+			return -EINVAL;
+		}
+		mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma,
+					vma->vm_mm, addr, addr + PAGE_SIZE);
+		mmu_notifier_invalidate_range_start(&range);
+		pmd = mm_find_pmd(mm, addr);
+		if (!pmd) {
+			mmu_notifier_invalidate_range_end(&range);
+			WARN(1, "mm_find_pmd failed, addr: %lx\n", addr);
+			return -ENXIO;
+		}
+		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+		flush_cache_page(vma, addr, pte_pfn(*pte));
+		old_pte = ptep_clear_flush(vma, addr, pte);
+		if (pte_dirty(old_pte) || PageDirty(page))
+			pages_dirty = true;
+		/* leave a userswap entry behind so a later fault is recognizable */
+		set_pte(pte, swp_entry_to_pte(swp_entry(SWP_USERSWAP_ENTRY,
+							page_to_pfn(page))));
+		dec_mm_counter(mm, MM_ANONPAGES);
+		page_remove_rmap(page, false);
+		put_page(page);
+		pte_unmap_unlock(pte, ptl);
+		mmu_notifier_invalidate_range_end(&range);
+		vma->vm_flags |= VM_USWAP;
+		page->mapping = NULL;
+		addr += PAGE_SIZE;
+		i++;
+	}
+
+	addr_start = new_addr;
+	addr_end = new_addr + len;
+	addr = addr_start;
+	vma = find_vma(mm, addr);
+	i = 0;
+	while (addr < addr_end) {
+		page = pages[i];
+		if (addr >= vma->vm_end)
+			vma = find_vma(mm, addr);
+		err = vm_insert_page(vma, addr, page);
+		if (err)
+			pr_err("vm_insert_page failed: %d\n", err);
+		i++;
+		addr += PAGE_SIZE;
+	}
+	vma->vm_flags |= VM_USWAP;
+
+	if (pages_dirty)
+		new_addr = new_addr | USWAP_PAGES_DIRTY;
+
+	return new_addr;
+}
+#endif
+
 /*
  * The caller must write-lock current->mm->mmap_lock.
  */
@@ -1409,6 +1575,12 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	struct mm_struct *mm = current->mm;
 	vm_flags_t vm_flags;
 	int pkey = 0;
+#ifdef CONFIG_USERSWAP
+	struct page **pages = NULL;
+	unsigned long addr_start = addr;
+	int i, page_num = 0;
+	unsigned long ret;
+#endif
 	*populate = 0;
@@ -1425,6 +1597,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 		if (!(file && path_noexec(&file->f_path)))
 			prot |= PROT_EXEC;
+#ifdef CONFIG_USERSWAP
+	if (flags & MAP_REPLACE) {
+		if (offset_in_page(addr) || (len % PAGE_SIZE))
+			return -EINVAL;
+		page_num = len / PAGE_SIZE;
+		ret = pages_can_be_swapped(mm, addr, len, &pages);
+		if (ret)
+			return ret;
+	}
+#endif
+
 	/* force arch specific MAP_FIXED handling in get_unmapped_area */
 	if (flags & MAP_FIXED_NOREPLACE)
 		flags |= MAP_FIXED;
@@ -1580,12 +1763,38 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 			vm_flags |= VM_NORESERVE;
 	}
+#ifdef CONFIG_USERSWAP
+	/* mark the vma as special to avoid merging with other vmas */
+	if (flags & MAP_REPLACE)
+		vm_flags |= VM_SPECIAL;
+#endif
+
 	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
 	if (!IS_ERR_VALUE(addr) &&
 	    ((vm_flags & VM_LOCKED) ||
 	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
 		*populate = len;
+#ifndef CONFIG_USERSWAP
 	return addr;
+#else
+	if (!(flags & MAP_REPLACE))
+		return addr;
+
+	if (IS_ERR_VALUE(addr)) {
+		pr_info("mmap_region failed, return addr: %lx\n", addr);
+		ret = addr;
+		goto out;
+	}
+
+	ret = do_user_swap(mm, addr_start, len, pages, addr);
+out:
+	/* drop the references taken by follow_page() in pages_can_be_swapped() */
+	for (i = 0; i < page_num; i++)
+		put_page(pages[i]);
+	kfree(pages);
+	return ret;
+#endif
 }
 unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
......
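
Taken together, the swap-out side seen from userspace is: register the
region with userfaultfd in missing mode (otherwise pages_can_be_swapped()
fails with -EAGAIN), call mmap() with MAP_REPLACE, write the returned
pages out, and munmap() the temporary range. A minimal, hedged sketch of
the registration step, using the standard userfaultfd ioctls; the helper
name and error handling are illustrative, not from this patch:

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Register [va, va + len) for missing-page faults; returns the uffd to
 * poll for swap-in requests, or -1 on error. Illustrative only. */
static int make_region_swappable(void *va, size_t len)
{
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)va, .len = len },
		.mode = UFFDIO_REGISTER_MODE_MISSING,
	};

	if (uffd < 0)
		return -1;
	if (ioctl(uffd, UFFDIO_API, &api) || ioctl(uffd, UFFDIO_REGISTER, &reg)) {
		close(uffd);
		return -1;
	}
	return uffd;
}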