Commit e3452806 authored by Guo Fan, committed by Cheng Jian

userswap: add a new flag 'MAP_REPLACE' for mmap()

hulk inclusion
category: feature
bugzilla: 47439
CVE: NA

-------------------------------------------------

To make sure no other userspace threads can access the memory region we
are swapping out, we need to unmap the memory region, map it to a new
address, and use the new address to perform the swapout. We add a new
flag 'MAP_REPLACE' for mmap() to unmap the pages at the input address
'VA' and remap them to a new 'tmpVA'.
Signed-off-by: Guo Fan <guofan5@huawei.com>
Signed-off-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Jing Xiangfeng <jingxiangfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
Parent 8bc9bb27
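
Before the diff, a minimal usage sketch (not part of the commit) of how a userspace swapper might call mmap() with the new flag. The MAP_REPLACE value and the dirty-bit convention come from the diff below; the helper name remap_for_swapout() and the exact MAP_ANONYMOUS | MAP_PRIVATE flag combination are illustrative assumptions:

#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>

#ifndef MAP_REPLACE
#define MAP_REPLACE       0x1000000   /* value defined by this patch */
#endif
#define USWAP_PAGES_DIRTY 1UL         /* bit 0 of the returned address, per do_user_swap() */

/*
 * 'va' is a page-aligned anonymous region already registered with
 * userfaultfd for missing-page faults; on success the kernel unmaps it
 * and returns a new address that maps the same pages.
 */
static void *remap_for_swapout(void *va, size_t len, int *dirty)
{
        void *tmp = mmap(va, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_REPLACE, -1, 0);
        if (tmp == MAP_FAILED)
                return NULL;

        /* The kernel encodes the dirty state in bit 0 of the returned address. */
        *dirty = ((uintptr_t)tmp & USWAP_PAGES_DIRTY) != 0;
        return (void *)((uintptr_t)tmp & ~USWAP_PAGES_DIRTY);
}
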
......@@ -665,6 +665,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
                [ilog2(VM_PKEY_BIT4)]   = "",
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */
#ifdef CONFIG_USERSWAP
                [ilog2(VM_USWAP)]       = "us",
#endif
        };
        size_t i;
......
......@@ -236,6 +236,11 @@ extern unsigned int kobjsize(const void *objp);
#define VM_CHECKNODE 0x200000000
#ifdef CONFIG_USERSWAP
/* bit[32:36] is the protection key of intel, so use a large value for VM_USWAP */
#define VM_USWAP 0x2000000000000000
#endif
#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
#define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */
......
......@@ -52,6 +52,16 @@ static inline int current_is_kswapd(void)
 * actions on faults.
 */

/*
 * Userswap entry type
 */
#ifdef CONFIG_USERSWAP
#define SWP_USERSWAP_NUM 1
#define SWP_USERSWAP_ENTRY (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+SWP_DEVICE_NUM)
#else
#define SWP_USERSWAP_NUM 0
#endif

/*
 * Unaddressable device memory support. See include/linux/hmm.h and
 * Documentation/vm/hmm.rst. Short description is we need struct pages for
......@@ -92,7 +102,7 @@ static inline int current_is_kswapd(void)
#define MAX_SWAPFILES \
        ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
-       SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+       SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_USERSWAP_NUM)
/*
* Magic header for a swap area. The first part of the union is
......
......@@ -130,6 +130,12 @@ IF_HAVE_PG_IDLE(PG_idle, "idle"), \
#define IF_HAVE_VM_SOFTDIRTY(flag,name)
#endif
#ifdef CONFIG_USERSWAP
#define IF_HAVE_VM_USWAP(flag,name) {flag, name },
#else
#define IF_HAVE_VM_USWAP(flag,name)
#endif
#define __def_vmaflag_names \
{VM_READ, "read" }, \
{VM_WRITE, "write" }, \
......@@ -161,6 +167,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
{VM_MIXEDMAP, "mixedmap" }, \
{VM_HUGEPAGE, "hugepage" }, \
{VM_NOHUGEPAGE, "nohugepage" }, \
IF_HAVE_VM_USWAP(VM_USWAP, "userswap" ) \
{VM_MERGEABLE, "mergeable" } \
#define show_vma_flags(flags) \
......
......@@ -17,6 +17,10 @@
#define MAP_SYNC 0x80000 /* perform synchronous page faults for the mapping */
#define MAP_PA32BIT 0x400000 /* physical address is within 4G */
#ifdef CONFIG_USERSWAP
#define MAP_REPLACE 0x1000000
#endif
/* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */
#define MCL_CURRENT 1 /* lock all current mappings */
......
......@@ -503,6 +503,15 @@ config SHRINK_PAGECACHE
          if unsure, say N to disable the SHRINK_PAGECACHE.

config USERSWAP
        bool "Enable User Swap"
        depends on MMU && USERFAULTFD
        depends on X86 || ARM64
        default n
        help
          Support for User Swap. This is based on userfaultfd. We can implement
          our own swapout and swapin functions in userspace.

config CMA
        bool "Contiguous Memory Allocator"
        depends on HAVE_MEMBLOCK && MMU
......
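
The Kconfig help above notes that userswap builds on userfaultfd, and pages_can_be_swapped() in the mm/mmap.c hunk below rejects any VMA without VM_UFFD_MISSING. A minimal sketch (not part of the commit) of the userfaultfd registration a userspace swapper would perform on the region before using MAP_REPLACE; error handling is trimmed and the helper name is illustrative:

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Register 'addr ~ addr+len' for missing-page faults; returns the uffd on success. */
static int register_uffd_missing(void *addr, size_t len)
{
        struct uffdio_api api = { .api = UFFD_API };
        struct uffdio_register reg = {
                .range = { .start = (unsigned long)addr, .len = len },
                .mode  = UFFDIO_REGISTER_MODE_MISSING,
        };
        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

        if (uffd < 0)
                return -1;
        if (ioctl(uffd, UFFDIO_API, &api) < 0 ||
            ioctl(uffd, UFFDIO_REGISTER, &reg) < 0) {
                close(uffd);
                return -1;
        }
        /* Missing faults on this range can now be serviced (e.g. with UFFDIO_COPY). */
        return uffd;
}
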
......@@ -46,6 +46,7 @@
#include <linux/pkeys.h>
#include <linux/oom.h>
#include <linux/sched/mm.h>
#include <linux/swapops.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
......@@ -1372,6 +1373,169 @@ int unregister_mmap_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(unregister_mmap_notifier);
#endif
#ifdef CONFIG_USERSWAP
/*
 * Check if pages between 'addr ~ addr+len' can be user swapped. If so, get
 * the reference of the pages and return the pages through input parameters
 * 'ppages'.
 */
int pages_can_be_swapped(struct mm_struct *mm, unsigned long addr,
                         unsigned long len, struct page ***ppages)
{
        struct vm_area_struct *vma;
        struct page *page = NULL;
        struct page **pages = NULL;
        unsigned long addr_start, addr_end;
        unsigned long ret;
        int i, page_num = 0;

        pages = kmalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL);
        if (!pages)
                return -ENOMEM;

        addr_start = addr;
        addr_end = addr + len;
        while (addr < addr_end) {
                vma = find_vma(mm, addr);
                if (!vma || !vma_is_anonymous(vma) ||
                    (vma->vm_flags & VM_LOCKED) || vma->vm_file
                    || (vma->vm_flags & VM_STACK) || (vma->vm_flags & (VM_IO | VM_PFNMAP))) {
                        ret = -EINVAL;
                        goto out;
                }
                if (!(vma->vm_flags & VM_UFFD_MISSING)) {
                        ret = -EAGAIN;
                        goto out;
                }
get_again:
                /* follow_page will inc page ref, dec the ref after we remap the page */
                page = follow_page(vma, addr, FOLL_GET);
                if (IS_ERR_OR_NULL(page)) {
                        ret = -ENODEV;
                        goto out;
                }
                pages[page_num] = page;
                page_num++;
                if (!PageAnon(page) || !PageSwapBacked(page) || PageHuge(page) || PageSwapCache(page)) {
                        ret = -EINVAL;
                        goto out;
                } else if (PageTransCompound(page)) {
                        if (trylock_page(page)) {
                                if (!split_huge_page(page)) {
                                        put_page(page);
                                        page_num--;
                                        unlock_page(page);
                                        goto get_again;
                                } else {
                                        unlock_page(page);
                                        ret = -EINVAL;
                                        goto out;
                                }
                        } else {
                                ret = -EINVAL;
                                goto out;
                        }
                }
                if (page_mapcount(page) > 1 || page_mapcount(page) + 1 != page_count(page)) {
                        ret = -EBUSY;
                        goto out;
                }
                addr += PAGE_SIZE;
        }

        *ppages = pages;
        return 0;

out:
        for (i = 0; i < page_num; i++)
                put_page(pages[i]);
        if (pages)
                kfree(pages);
        *ppages = NULL;
        return ret;
}
/*
 * In uswap situation, we use the bit 0 of the returned address to indicate
 * whether the pages are dirty.
 */
#define USWAP_PAGES_DIRTY 1

/* unmap the pages between 'addr ~ addr+len' and remap them to a new address */
unsigned long do_user_swap(struct mm_struct *mm, unsigned long addr_start,
                           unsigned long len, struct page **pages, unsigned long new_addr)
{
        struct vm_area_struct *vma;
        struct page *page;
        pmd_t *pmd;
        pte_t *pte, old_pte;
        spinlock_t *ptl;
        unsigned long addr, addr_end;
        bool pages_dirty = false;
        int i, err;

        addr_end = addr_start + len;
        lru_add_drain();
        mmu_notifier_invalidate_range_start(mm, addr_start, addr_end);
        addr = addr_start;
        i = 0;
        while (addr < addr_end) {
                page = pages[i];
                vma = find_vma(mm, addr);
                if (!vma) {
                        mmu_notifier_invalidate_range_end(mm, addr_start, addr_end);
                        WARN_ON("find_vma failed\n");
                        return -EINVAL;
                }
                pmd = mm_find_pmd(mm, addr);
                if (!pmd) {
                        mmu_notifier_invalidate_range_end(mm, addr_start, addr_end);
                        WARN_ON("mm_find_pmd failed, addr:%llx\n");
                        return -ENXIO;
                }
                pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
                flush_cache_page(vma, addr, pte_pfn(*pte));
                old_pte = ptep_clear_flush(vma, addr, pte);
                if (pte_dirty(old_pte) || PageDirty(page))
                        pages_dirty = true;
                set_pte(pte, swp_entry_to_pte(swp_entry(SWP_USERSWAP_ENTRY, page_to_pfn(page))));
                dec_mm_counter(mm, MM_ANONPAGES);
                page_remove_rmap(page, false);
                put_page(page);

                pte_unmap_unlock(pte, ptl);
                vma->vm_flags |= VM_USWAP;
                page->mapping = NULL;
                addr += PAGE_SIZE;
                i++;
        }
        mmu_notifier_invalidate_range_end(mm, addr_start, addr_end);

        addr_start = new_addr;
        addr_end = new_addr + len;
        addr = addr_start;
        vma = find_vma(mm, addr);
        i = 0;
        while (addr < addr_end) {
                page = pages[i];
                if (addr > vma->vm_end - 1)
                        vma = find_vma(mm, addr);
                err = vm_insert_page(vma, addr, page);
                if (err) {
                        pr_err("vm_insert_page failed:%d\n", err);
                }
                i++;
                addr += PAGE_SIZE;
        }
        vma->vm_flags |= VM_USWAP;

        if (pages_dirty)
                new_addr = new_addr | USWAP_PAGES_DIRTY;

        return new_addr;
}
#endif
/*
 * The caller must hold down_write(&current->mm->mmap_sem).
 */
......@@ -1383,6 +1547,12 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
{
        struct mm_struct *mm = current->mm;
        int pkey = 0;
#ifdef CONFIG_USERSWAP
        struct page **pages = NULL;
        unsigned long addr_start = addr;
        int i, page_num = 0;
        unsigned long ret;
#endif

        *populate = 0;
......@@ -1399,6 +1569,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
                if (!(file && path_noexec(&file->f_path)))
                        prot |= PROT_EXEC;

#ifdef CONFIG_USERSWAP
        if (flags & MAP_REPLACE) {
                if (offset_in_page(addr) || (len % PAGE_SIZE))
                        return -EINVAL;
                page_num = len / PAGE_SIZE;
                ret = pages_can_be_swapped(mm, addr, len, &pages);
                if (ret)
                        return ret;
        }
#endif

        /* force arch specific MAP_FIXED handling in get_unmapped_area */
        if (flags & MAP_FIXED_NOREPLACE)
                flags |= MAP_FIXED;
......@@ -1571,12 +1752,38 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
        if (flags & MAP_CHECKNODE)
                set_vm_checknode(&vm_flags, flags);

#ifdef CONFIG_USERSWAP
        /* mark the vma as special to avoid merging with other vmas */
        if (flags & MAP_REPLACE)
                vm_flags |= VM_SPECIAL;
#endif

        addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
        if (!IS_ERR_VALUE(addr) &&
            ((vm_flags & VM_LOCKED) ||
             (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
                *populate = len;
#ifndef CONFIG_USERSWAP
        return addr;
#else
        if (!(flags & MAP_REPLACE))
                return addr;

        if (IS_ERR_VALUE(addr)) {
                pr_info("mmap_region failed, return addr:%lx\n", addr);
                ret = addr;
                goto out;
        }

        ret = do_user_swap(mm, addr_start, len, pages, addr);
out:
        /* follow_page() above increased the reference */
        for (i = 0; i < page_num; i++)
                put_page(pages[i]);
        if (pages)
                kfree(pages);
        return ret;
#endif
}

unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
......