Unverified · Commit 22ff6706 authored by openeuler-ci-bot, committed by Gitee

!786 Support userswap feature

Merge Pull Request from: @anred 
 
This patch series optimizes userswap, mainly the swap-in and swap-out
paths.

We tested the concurrent scenario of multi-threaded page faults and
multi-threaded swap-in with the uswap demo; both the remapping in the
swap-out phase and the copy-free path in the swap-in phase worked as
expected. During the tests, the related debugging options, including
CONFIG_DEBUG_VM, lockdep, slub debug, KASAN and kmemleak, were enabled.
 
Link: https://gitee.com/openeuler/kernel/pulls/786 

Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com> 
Reviewed-by: Jialin Zhang <zhangjialin11@huawei.com> 
Signed-off-by: Jialin Zhang <zhangjialin11@huawei.com> 
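
For context, here is a minimal userspace sketch of how a swap daemon is expected to drive the interfaces this series touches: register a region in userswap mode, swap it out with MREMAP_USWAP_SET_PTE, and resolve the resulting userfaults copy-free with UFFDIO_COPY_MODE_DIRECT_MAP. This is only an illustration inferred from the diff below, not code from the series: the helper names (uswap_register_range, uswap_swap_out, uswap_swap_in) are made up, error handling is omitted, and UFFDIO_REGISTER_MODE_USWAP is assumed to come from the patched openEuler <linux/userfaultfd.h> (its value is not part of this diff).

```c
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>  /* patched openEuler uapi header assumed */

#ifndef MREMAP_USWAP_SET_PTE
#define MREMAP_USWAP_SET_PTE 64                    /* value added by this series */
#endif
#ifndef UFFDIO_COPY_MODE_DIRECT_MAP
#define UFFDIO_COPY_MODE_DIRECT_MAP (1ULL << 10)   /* value added by this series */
#endif
#define USWAP_PAGES_DIRTY 1UL   /* bit 0 of the address returned on swap-out */

/*
 * Register [addr, addr + len) for MISSING faults in userswap mode; the
 * kernel widens the range to whole vmas (see uswap_adjust_uffd_range()).
 */
static int uswap_register_range(int uffd, void *addr, size_t len)
{
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode  = UFFDIO_REGISTER_MODE_MISSING |
			 UFFDIO_REGISTER_MODE_USWAP,
	};

	return ioctl(uffd, UFFDIO_REGISTER, &reg);
}

/*
 * Swap out: detach the pages of [addr, addr + len), leave SWP_USERSWAP_ENTRY
 * ptes behind, and remap the pages to new_addr so the daemon can write them
 * to its backing store.  Bit 0 of the returned address reports whether any
 * of the pages was dirty.
 */
static void *uswap_swap_out(void *addr, size_t len, void *new_addr, int *dirty)
{
	long ret = syscall(SYS_mremap, addr, len, len,
			   MREMAP_USWAP_SET_PTE, new_addr);

	if (ret == -1)
		return MAP_FAILED;
	*dirty = ret & USWAP_PAGES_DIRTY;
	return (void *)(ret & ~USWAP_PAGES_DIRTY);
}

/*
 * Swap in: resolve a VM_USWAP userfault without copying.  'src' must be a
 * page-aligned buffer that already holds the page contents; DIRECT_MAP maps
 * that page into place instead of duplicating it.
 */
static int uswap_swap_in(int uffd, const struct uffd_msg *msg, void *src,
			 size_t page_size)
{
	struct uffdio_copy copy = {
		.dst  = msg->arg.pagefault.address & ~(page_size - 1),
		.src  = (unsigned long)src,
		.len  = page_size,
		.mode = UFFDIO_COPY_MODE_DIRECT_MAP,
	};

	return ioctl(uffd, UFFDIO_COPY, &copy);
}
```

Note that, per the swap-out path in mm/userswap.c below, new_addr is expected to already be covered by an anonymous mapping, since the kernel inserts the detached pages into an existing vma rather than creating one.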
......@@ -27,13 +27,11 @@
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/userswap.h>
int sysctl_unprivileged_userfaultfd __read_mostly = 1;
static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
#ifdef CONFIG_USERSWAP
int enable_userswap;
#endif
/*
* Start with fault_pending_wqh and fault_wqh so they're more likely
......@@ -220,6 +218,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
if (features & UFFD_FEATURE_THREAD_ID)
msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
#ifdef CONFIG_USERSWAP
uswap_get_cpu_id(reason, &msg);
#endif
return msg;
}
......@@ -334,8 +335,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
* changes under us.
*/
#ifdef CONFIG_USERSWAP
if ((reason & VM_USWAP) && (!pte_present(*pte)))
ret = true;
uswap_must_wait(reason, *pte, &ret);
#endif
if (pte_none(*pte))
ret = true;
......@@ -408,8 +408,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
BUG_ON(ctx->mm != mm);
#ifdef CONFIG_USERSWAP
VM_BUG_ON(uswap_vm_flag_bug_on(reason));
#else
VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
#endif
if (ctx->features & UFFD_FEATURE_SIGBUS)
goto out;
......@@ -483,6 +487,10 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
uwq.wq.private = current;
uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
ctx->features);
#ifdef CONFIG_USERSWAP
if (reason & VM_USWAP && pte_none(vmf->orig_pte))
uwq.msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_FPF;
#endif
uwq.ctx = ctx;
uwq.waken = false;
......@@ -866,8 +874,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
for (vma = mm->mmap; vma; vma = vma->vm_next) {
userfault_flags = VM_UFFD_MISSING | VM_UFFD_WP;
#ifdef CONFIG_USERSWAP
if (enable_userswap)
userfault_flags |= VM_USWAP;
uswap_release(&userfault_flags);
#endif
cond_resched();
BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
......@@ -1275,6 +1282,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
bool found;
bool basic_ioctls;
unsigned long start, end, vma_end;
#ifdef CONFIG_USERSWAP
bool uswap_mode = false;
#endif
user_uffdio_register = (struct uffdio_register __user *) arg;
......@@ -1288,26 +1298,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
goto out;
vm_flags = 0;
#ifdef CONFIG_USERSWAP
/*
* register the whole vma overlapping with the address range to avoid
* splitting the vma.
*/
if (enable_userswap && (uffdio_register.mode & UFFDIO_REGISTER_MODE_USWAP)) {
uffdio_register.mode &= ~UFFDIO_REGISTER_MODE_USWAP;
if (!uffdio_register.mode)
goto out;
vm_flags |= VM_USWAP;
end = uffdio_register.range.start + uffdio_register.range.len - 1;
vma = find_vma(mm, uffdio_register.range.start);
if (!vma)
goto out;
uffdio_register.range.start = vma->vm_start;
vma = find_vma(mm, end);
if (!vma)
goto out;
uffdio_register.range.len = vma->vm_end - uffdio_register.range.start;
}
if (!uswap_register(&uffdio_register, &uswap_mode))
goto out;
#endif
if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
UFFDIO_REGISTER_MODE_WP))
......@@ -1321,7 +1313,13 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
uffdio_register.range.len);
if (ret)
goto out;
#ifdef CONFIG_USERSWAP
if (uswap_mode && !uswap_adjust_uffd_range(&uffdio_register,
&vm_flags, mm)) {
ret = -EINVAL;
goto out;
}
#endif
start = uffdio_register.range.start;
end = start + uffdio_register.range.len;
......@@ -1717,7 +1715,10 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
ret = -EINVAL;
if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
goto out;
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE |
			 UFFDIO_COPY_MODE_WP |
			 (IS_ENABLED(CONFIG_USERSWAP) ?
			  UFFDIO_COPY_MODE_DIRECT_MAP : 0)))
goto out;
if (mmget_not_zero(ctx->mm)) {
ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
......@@ -2029,15 +2030,6 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
return fd;
}
#ifdef CONFIG_USERSWAP
static int __init enable_userswap_setup(char *str)
{
enable_userswap = true;
return 1;
}
__setup("enable_userswap", enable_userswap_setup);
#endif
static int __init userfaultfd_init(void)
{
userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
......
......@@ -31,9 +31,6 @@
#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
extern int sysctl_unprivileged_userfaultfd;
#ifdef CONFIG_USERSWAP
extern int enable_userswap;
#endif
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
......
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved.
*/
#ifndef _LINUX_USERSWAP_H
#define _LINUX_USERSWAP_H
#include <linux/mman.h>
#include <linux/userfaultfd.h>
#ifdef CONFIG_USERSWAP
extern struct static_key_false userswap_enabled;
/*
 * In the uswap case, bit 0 of the returned address indicates whether the
 * pages are dirty.
*/
#define USWAP_PAGES_DIRTY 1
int mfill_atomic_pte_nocopy(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr);
unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len,
unsigned long new_addr, unsigned long new_len);
bool uswap_register(struct uffdio_register *uffdio_register, bool *uswap_mode);
bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register,
unsigned long *vm_flags, struct mm_struct *mm);
bool do_uswap_page(swp_entry_t entry, struct vm_fault *vmf,
struct vm_area_struct *vma, vm_fault_t *ret);
static inline bool uswap_check_copy(struct vm_area_struct *vma,
unsigned long src_addr,
unsigned long len, __u64 mode)
{
if (vma->vm_flags & VM_USWAP) {
if (!(mode & UFFDIO_COPY_MODE_DIRECT_MAP))
return false;
if (offset_in_page(src_addr))
return false;
if (src_addr > TASK_SIZE || src_addr > TASK_SIZE - len)
return false;
} else {
if (mode & UFFDIO_COPY_MODE_DIRECT_MAP)
return false;
}
return true;
}
static inline bool uswap_validate_mremap_flags(unsigned long flags)
{
if (static_branch_unlikely(&userswap_enabled)) {
if (flags & MREMAP_USWAP_SET_PTE &&
flags & ~MREMAP_USWAP_SET_PTE)
return false;
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE |
MREMAP_DONTUNMAP | MREMAP_USWAP_SET_PTE))
return false;
} else {
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE |
MREMAP_DONTUNMAP))
return false;
}
return true;
}
/*
 * With CONFIG_USERSWAP=y, VM_UFFD_MISSING | VM_USWAP is a valid combination;
 * otherwise exactly one of VM_UFFD_MISSING and VM_UFFD_WP must be set.
 * Any other combination of reason flags is a bug.
 */
static inline bool uswap_vm_flag_bug_on(unsigned long reason)
{
if (reason & ~(VM_UFFD_MISSING | VM_UFFD_WP | VM_USWAP))
return true;
if (reason & VM_USWAP)
return !(reason & VM_UFFD_MISSING) ||
reason & ~(VM_USWAP|VM_UFFD_MISSING);
return !(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP);
}
static inline bool uswap_missing(struct vm_area_struct *vma)
{
if (vma->vm_flags & VM_USWAP && vma->vm_flags & VM_UFFD_MISSING)
return true;
return false;
}
static inline void uswap_get_cpu_id(unsigned long reason, struct uffd_msg *msg)
{
if (reason & VM_USWAP)
msg->reserved3 = smp_processor_id();
}
static inline void uswap_release(unsigned long *userfault_flags)
{
if (static_branch_unlikely(&userswap_enabled))
*userfault_flags |= VM_USWAP;
}
static inline void uswap_must_wait(unsigned long reason, pte_t pte, bool *ret)
{
if ((reason & VM_USWAP) && (!pte_present(pte)))
*ret = true;
}
#endif /* CONFIG_USERSWAP */
#endif /* _LINUX_USERSWAP_H */
......@@ -30,8 +30,6 @@
#define MAP_SYNC 0x080000 /* perform synchronous page faults for the mapping */
#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
#define MAP_REPLACE 0x1000000
#define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be
* uninitialized */
......
......@@ -8,6 +8,7 @@
#define MREMAP_MAYMOVE 1
#define MREMAP_FIXED 2
#define MREMAP_DONTUNMAP 4
#define MREMAP_USWAP_SET_PTE 64
#define OVERCOMMIT_GUESS 0
#define OVERCOMMIT_ALWAYS 1
......
......@@ -127,6 +127,7 @@ struct uffd_msg {
/* flags for UFFD_EVENT_PAGEFAULT */
#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */
#define UFFD_PAGEFAULT_FLAG_FPF (1<<10) /* If this was the first page fault */
struct uffdio_api {
/* userland asks for an API number and the features to enable */
......@@ -217,6 +218,7 @@ struct uffdio_copy {
* according to the uffdio_register.ioctls.
*/
#define UFFDIO_COPY_MODE_WP ((__u64)1<<1)
#define UFFDIO_COPY_MODE_DIRECT_MAP ((__u64)1<<10)
__u64 mode;
/*
......
......@@ -113,6 +113,7 @@ obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_USERSWAP) += userswap.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
......
......@@ -73,6 +73,7 @@
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/userswap.h>
#include <trace/events/kmem.h>
......@@ -3395,22 +3396,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
entry = pte_to_swp_entry(vmf->orig_pte);
#ifdef CONFIG_USERSWAP
if (swp_type(entry) == SWP_USERSWAP_ENTRY) {
/* print error if we come across a nested fault */
if (!strncmp(current->comm, "uswap", 5)) {
pr_err("USWAP: fault %lx is triggered by %s\n",
vmf->address, current->comm);
return VM_FAULT_SIGBUS;
}
if (!(vma->vm_flags & VM_UFFD_MISSING)) {
pr_err("USWAP: addr %lx flags %lx is not a user swap page",
vmf->address, vma->vm_flags);
goto skip_uswap;
}
ret = handle_userfault(vmf, VM_UFFD_MISSING | VM_USWAP);
if (!do_uswap_page(entry, vmf, vma, &ret))
return ret;
}
skip_uswap:
#endif
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
......@@ -3689,6 +3676,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
if (ret)
goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
#ifdef CONFIG_USERSWAP
if (uswap_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_MISSING|VM_USWAP);
}
#endif
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_MISSING);
......@@ -3731,6 +3724,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
goto release;
/* Deliver the page fault to userland, check inside PT lock */
#ifdef CONFIG_USERSWAP
if (uswap_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
put_page(page);
return handle_userfault(vmf, VM_UFFD_MISSING | VM_USWAP);
}
#endif
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
put_page(page);
......
......@@ -1623,205 +1623,6 @@ __do_mmap(struct file *file, unsigned long addr, unsigned long len,
{
return __do_mmap_mm(current->mm, file, addr, len, prot, flags, vm_flags, pgoff, populate, uf);
}
#ifdef CONFIG_USERSWAP
/*
* Check if pages between 'addr ~ addr+len' can be user swapped. If so, get
* the reference of the pages and return the pages through input parameters
* 'ppages'.
*/
static int pages_can_be_swapped(struct mm_struct *mm, unsigned long addr,
unsigned long len, struct page ***ppages)
{
struct vm_area_struct *vma;
struct page *page = NULL;
struct page **pages = NULL;
unsigned long addr_end = addr + len;
unsigned long ret;
int i, page_num = 0;
pages = kmalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL);
if (!pages)
return -ENOMEM;
while (addr < addr_end) {
vma = find_vma(mm, addr);
if (!vma || !vma_is_anonymous(vma) || vma->vm_file ||
(vma->vm_flags & VM_LOCKED) || (vma->vm_flags & VM_STACK) ||
(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
ret = -EINVAL;
goto out;
}
if (!(vma->vm_flags & VM_UFFD_MISSING)) {
ret = -EAGAIN;
goto out;
}
get_again:
/* follow_page will inc page ref, dec the ref after we remap the page */
page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page)) {
ret = -ENODEV;
goto out;
}
pages[page_num++] = page;
if (!PageAnon(page) || !PageSwapBacked(page) ||
PageHuge(page) || PageSwapCache(page)) {
ret = -EINVAL;
goto out;
} else if (PageTransCompound(page)) {
if (trylock_page(page)) {
if (!split_huge_page(page)) {
put_page(page);
page_num--;
unlock_page(page);
goto get_again;
} else {
unlock_page(page);
ret = -EINVAL;
goto out;
}
} else {
ret = -EINVAL;
goto out;
}
}
if (page_mapcount(page) > 1 ||
page_mapcount(page) + 1 != page_count(page)) {
ret = -EBUSY;
goto out;
}
addr += PAGE_SIZE;
}
*ppages = pages;
return 0;
out:
for (i = 0; i < page_num; i++)
put_page(pages[i]);
if (pages)
kfree(pages);
*ppages = NULL;
return ret;
}
/*
* In uswap situation, we use the bit 0 of the returned address to indicate
* whether the pages are dirty.
*/
#define USWAP_PAGES_DIRTY 1
/* unmap the pages between 'addr ~ addr+len' and remap them to a new address */
static unsigned long
do_user_swap(struct mm_struct *mm, unsigned long addr_start, unsigned long len,
struct page **pages, unsigned long new_addr)
{
struct vm_area_struct *vma;
struct page *page;
struct mmu_notifier_range range;
pmd_t *pmd;
pte_t *pte, old_pte;
spinlock_t *ptl;
unsigned long addr;
bool pages_dirty = false;
int i = 0;
addr = addr_start;
lru_add_drain();
i = 0;
while (addr < addr_start + len) {
page = pages[i];
vma = find_vma(mm, addr);
if (!vma)
return -EINVAL;
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma,
vma->vm_mm, addr, addr + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
pmd = mm_find_pmd(mm, addr);
if (!pmd) {
mmu_notifier_invalidate_range_end(&range);
return -ENXIO;
}
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
flush_cache_page(vma, addr, pte_pfn(*pte));
old_pte = ptep_clear_flush(vma, addr, pte);
if (pte_dirty(old_pte) || PageDirty(page))
pages_dirty = true;
set_pte(pte, swp_entry_to_pte(swp_entry(SWP_USERSWAP_ENTRY,
page_to_pfn(page))));
dec_mm_counter(mm, MM_ANONPAGES);
reliable_page_counter(page, mm, -1);
page_remove_rmap(page, false);
put_page(page);
pte_unmap_unlock(pte, ptl);
mmu_notifier_invalidate_range_end(&range);
vma->vm_flags |= VM_USWAP;
page->mapping = NULL;
addr += PAGE_SIZE;
i++;
}
addr = new_addr;
vma = find_vma(mm, addr);
i = 0;
while (addr < new_addr + len) {
if (addr > vma->vm_end - 1)
vma = find_vma(mm, addr);
if (!vma)
return -ENODEV;
page = pages[i++];
if (vm_insert_page(vma, addr, page))
return -EFAULT;
addr += PAGE_SIZE;
}
vma->vm_flags |= VM_USWAP;
if (pages_dirty)
new_addr = new_addr | USWAP_PAGES_DIRTY;
return new_addr;
}
static inline unsigned long
do_uswap_mmap(struct file *file, unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags, unsigned long pgoff,
unsigned long *populate, struct list_head *uf)
{
struct mm_struct *mm = current->mm;
unsigned long old_addr = addr;
struct page **pages = NULL;
unsigned long ret;
int i;
if (!len || offset_in_page(addr) || (len % PAGE_SIZE))
return -EINVAL;
ret = pages_can_be_swapped(mm, addr, len, &pages);
if (ret)
return ret;
/* mark the vma as special to avoid merging with other vmas */
addr = __do_mmap(file, addr, len, prot, flags, VM_SPECIAL, pgoff,
populate, uf);
if (IS_ERR_VALUE(addr)) {
ret = addr;
goto out;
}
ret = do_user_swap(mm, old_addr, len, pages, addr);
out:
/* follow_page() above increased the reference*/
for (i = 0; i < len / PAGE_SIZE; i++)
put_page(pages[i]);
if (pages)
kfree(pages);
return ret;
}
#endif
/*
* The caller must write-lock current->mm->mmap_lock.
......@@ -1831,11 +1632,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long flags, unsigned long pgoff,
unsigned long *populate, struct list_head *uf)
{
#ifdef CONFIG_USERSWAP
if (enable_userswap && (flags & MAP_REPLACE))
return do_uswap_mmap(file, addr, len, prot, flags, pgoff,
populate, uf);
#endif
return __do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf);
}
......
......@@ -25,6 +25,7 @@
#include <linux/mm-arch-hooks.h>
#include <linux/userfaultfd_k.h>
#include <linux/share_pool.h>
#include <linux/userswap.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
......@@ -915,8 +916,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
*/
addr = untagged_addr(addr);
#ifdef CONFIG_USERSWAP
if (!uswap_validate_mremap_flags(flags))
return ret;
#else
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
return ret;
#endif
if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
return ret;
......@@ -947,6 +953,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (!new_len)
return ret;
#ifdef CONFIG_USERSWAP
if (flags & MREMAP_USWAP_SET_PTE)
return uswap_mremap(addr, old_len, new_addr, new_len);
#endif
if (mmap_write_lock_killable(current->mm))
return -EINTR;
......
......@@ -15,6 +15,7 @@
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/userswap.h>
#include <asm/tlbflush.h>
#include "internal.h"
......@@ -90,10 +91,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
*pagep = NULL;
}
#ifdef CONFIG_USERSWAP
if (dst_vma->vm_flags & VM_USWAP)
ClearPageDirty(page);
#endif
/*
* The memory barrier inside __SetPageUptodate makes sure that
* preceding stores to the page contents become visible before
......@@ -112,10 +109,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
else
_dst_pte = pte_mkwrite(_dst_pte);
}
#ifdef CONFIG_USERSWAP
if (dst_vma->vm_flags & VM_USWAP)
_dst_pte = pte_mkclean(_dst_pte);
#endif
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
if (dst_vma->vm_file) {
......@@ -128,26 +121,9 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
goto out_release_uncharge_unlock;
}
#ifdef CONFIG_USERSWAP
if (!(dst_vma->vm_flags & VM_USWAP)) {
ret = -EEXIST;
if (!pte_none(*dst_pte))
goto out_release_uncharge_unlock;
} else {
/*
* The userspace may swap in a large area. Part of the area is
* not swapped out. Skip those pages.
*/
ret = 0;
if (swp_type(pte_to_swp_entry(*dst_pte)) != SWP_USERSWAP_ENTRY ||
pte_present(*dst_pte))
goto out_release_uncharge_unlock;
}
#else
ret = -EEXIST;
if (!pte_none(*dst_pte))
goto out_release_uncharge_unlock;
#endif
inc_mm_counter(dst_mm, MM_ANONPAGES);
reliable_page_counter(page, dst_mm, 1);
......@@ -535,6 +511,10 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
goto out_unlock;
err = -EINVAL;
#ifdef CONFIG_USERSWAP
if (!uswap_check_copy(dst_vma, src_addr, len, mode))
goto out_unlock;
#endif
/*
* shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
* it will overwrite vm_ops, so vma_is_anonymous must return false.
......@@ -605,8 +585,17 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
BUG_ON(pmd_none(*dst_pmd));
BUG_ON(pmd_trans_huge(*dst_pmd));
err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
src_addr, &page, zeropage, wp_copy);
#ifdef CONFIG_USERSWAP
if (static_branch_unlikely(&userswap_enabled) &&
dst_vma->vm_flags & VM_USWAP &&
mode & UFFDIO_COPY_MODE_DIRECT_MAP)
err = mfill_atomic_pte_nocopy(dst_mm, dst_pmd, dst_vma,
dst_addr, src_addr);
else
#endif
err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
dst_addr, src_addr, &page,
zeropage, wp_copy);
cond_resched();
if (unlikely(err == -ENOENT)) {
......
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved.
*
* userswap core file include swap-in and swap-out core function
*/
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/userswap.h>
#include <linux/userfaultfd_k.h>
#include <linux/security.h>
#include "internal.h"
DEFINE_STATIC_KEY_FALSE(userswap_enabled);
static bool vma_uswap_compatible(struct vm_area_struct *vma)
{
if (!vma || !(vma->vm_flags & VM_USWAP) || !vma_is_anonymous(vma) ||
vma->vm_file || vma->vm_flags & (VM_SHARED | VM_LOCKED | VM_STACK |
VM_IO | VM_PFNMAP))
return false;
return true;
}
static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pgd = pgd_offset(mm, addr);
if (pgd_none_or_clear_bad(pgd))
return NULL;
p4d = p4d_offset(pgd, addr);
if (p4d_none_or_clear_bad(p4d))
return NULL;
pud = pud_offset(p4d, addr);
if (pud_none_or_clear_bad(pud))
return NULL;
return pud;
}
static bool is_thp_or_huge(struct mm_struct *mm, unsigned long addr)
{
pud_t *pud;
pmd_t *pmd;
pud = get_old_pud(mm, addr);
if (!pud)
return false;
else if (pud_huge(*pud))
return true;
pmd = pmd_offset(pud, addr);
if (!pmd)
return false;
else if (pmd_huge(*pmd) || pmd_trans_huge(*pmd))
return true;
return false;
}
/*
* Check if pages between 'addr ~ addr+len' can be user swapped. If so, get
* the reference of the pages and return the pages through input parameters
* 'ppages'.
*/
static unsigned long pages_can_be_swapped(struct mm_struct *mm,
unsigned long addr,
unsigned long len,
struct page ***ppages)
{
struct vm_area_struct *vma;
struct page *page = NULL;
struct page **pages = NULL;
unsigned long addr_end = addr + len;
unsigned long ret;
unsigned long i, page_num = 0;
*ppages = NULL;
pages = kmalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL);
if (!pages)
return -ENOMEM;
while (addr < addr_end) {
vma = find_vma(mm, addr);
if (!vma || addr < vma->vm_start ||
!vma_uswap_compatible(vma)) {
ret = -EINVAL;
goto out_err;
}
if (!(vma->vm_flags & VM_UFFD_MISSING)) {
ret = -EAGAIN;
goto out_err;
}
get_again:
/*
* follow_page will inc page ref, dec the ref after we remap
* the page.
*/
page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page)) {
ret = -ENODEV;
goto out_err;
}
pages[page_num++] = page;
if (!PageAnon(page) || !PageSwapBacked(page) ||
PageHuge(page) || PageSwapCache(page)) {
ret = -EINVAL;
goto out_err;
}
if (PageTransCompound(page)) {
if (trylock_page(page)) {
if (!split_huge_page(page)) {
unlock_page(page);
put_page(page);
page_num--;
goto get_again;
} else
unlock_page(page);
}
ret = -EINVAL;
goto out_err;
}
/*
* Check that no O_DIRECT or similar I/O is in progress on the
* page
*/
if (page_mapcount(page) > 1) {
ret = -EBUSY;
goto out_err;
}
addr += PAGE_SIZE;
}
*ppages = pages;
return 0;
out_err:
for (i = 0; i < page_num; i++)
put_page(pages[i]);
kfree(pages);
return ret;
}
static void uswap_unmap_anon_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long addr, struct page *page,
pmd_t *pmd, pte_t *old_pte,
bool set_to_swp)
{
struct mmu_notifier_range range;
spinlock_t *ptl;
pte_t *pte, _old_pte;
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma,
vma->vm_mm, addr, addr + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (pte_none(*pte))
goto out_release_unlock;
flush_cache_page(vma, addr, pte_pfn(*pte));
_old_pte = ptep_clear_flush(vma, addr, pte);
if (set_to_swp)
set_pte_at(mm, addr, pte, swp_entry_to_pte(swp_entry(
SWP_USERSWAP_ENTRY, page_to_pfn(page))));
dec_mm_counter(mm, MM_ANONPAGES);
reliable_page_counter(page, mm, -1);
page_remove_rmap(page, false);
out_release_unlock:
pte_unmap_unlock(pte, ptl);
mmu_notifier_invalidate_range_end(&range);
page->mapping = NULL;
if (old_pte)
*old_pte = _old_pte;
}
static void uswap_map_anon_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long addr,
struct page *page,
pmd_t *pmd,
pte_t old_pte)
{
spinlock_t *ptl;
pte_t *pte;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
flush_cache_page(vma, addr, pte_pfn(*pte));
set_pte_at(mm, addr, pte, old_pte);
inc_mm_counter(mm, MM_ANONPAGES);
reliable_page_counter(page, mm, 1);
page_add_new_anon_rmap(page, vma, addr, false);
pte_unmap_unlock(pte, ptl);
}
static unsigned long vm_insert_anon_page(struct vm_area_struct *vma,
unsigned long addr, struct page *page)
{
struct mm_struct *mm = vma->vm_mm;
int ret = 0;
pte_t *pte;
spinlock_t *ptl;
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
flush_dcache_page(page);
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
return -ENOMEM;
if (!pte_none(*pte)) {
ret = -EBUSY;
goto out_unlock;
}
inc_mm_counter(mm, MM_ANONPAGES);
reliable_page_counter(page, mm, 1);
page_add_new_anon_rmap(page, vma, addr, false);
set_pte_at(mm, addr, pte, mk_pte(page, vma->vm_page_prot));
out_unlock:
pte_unmap_unlock(pte, ptl);
return ret;
}
static void uswapout_recover(struct mm_struct *mm,
unsigned long old_addr_start, unsigned long len,
struct page **pages, unsigned long new_addr_start,
pte_t *ptes)
{
unsigned long unmap_old_addr = old_addr_start;
unsigned long unmap_new_addr = new_addr_start;
struct page *page;
pmd_t *old_pmd, *new_pmd;
pte_t pte;
unsigned long i;
for (i = 0; i < len; i++) {
page = pages[i];
pte = ptes[i];
new_pmd = mm_find_pmd(mm, new_addr_start);
old_pmd = mm_find_pmd(mm, unmap_old_addr);
uswap_unmap_anon_page(mm, find_vma(mm, unmap_new_addr),
unmap_new_addr, page, new_pmd, NULL,
false);
uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr),
unmap_old_addr, page, old_pmd, pte);
unmap_old_addr += PAGE_SIZE;
unmap_new_addr += PAGE_SIZE;
}
if (pte_val(ptes[len]) != 0) {
page = pages[len];
pte = ptes[len];
old_pmd = mm_find_pmd(mm, unmap_old_addr);
uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr),
unmap_old_addr, page, old_pmd, pte);
get_page(page);
}
}
/* unmap the pages between 'addr ~ addr+len' and remap them to a new address */
static unsigned long do_user_swap(struct mm_struct *mm,
unsigned long old_addr_start,
unsigned long len, struct page **pages,
unsigned long new_addr_start)
{
struct vm_area_struct *old_vma, *new_vma;
unsigned long old_addr = old_addr_start;
unsigned long new_addr = new_addr_start;
struct page *page;
pmd_t *pmd;
pte_t old_pte, *ptes;
bool pages_dirty = false;
unsigned long i = 0, j;
int ret;
ptes = kmalloc(sizeof(pte_t) * (len / PAGE_SIZE), GFP_KERNEL);
if (!ptes)
return -ENOMEM;
memset(ptes, 0, sizeof(pte_t) * (len / PAGE_SIZE));
lru_add_drain();
for (j = 0; j < len; j += PAGE_SIZE) {
page = pages[i];
ret = -EINVAL;
if (!page)
goto out_recover;
if (is_thp_or_huge(mm, new_addr))
goto out_recover;
old_vma = find_vma(mm, old_addr);
if (!old_vma || old_addr < old_vma->vm_start)
goto out_recover;
new_vma = find_vma(mm, new_addr);
if (!new_vma || new_addr < new_vma->vm_start)
goto out_recover;
ret = -EACCES;
if (!(old_vma->vm_flags & VM_WRITE) &&
(new_vma->vm_flags & VM_WRITE))
goto out_recover;
ret = -ENXIO;
pmd = mm_find_pmd(mm, old_addr);
if (!pmd)
goto out_recover;
uswap_unmap_anon_page(mm, old_vma, old_addr, page, pmd,
&old_pte, true);
ptes[i] = old_pte;
if (pte_dirty(old_pte) || PageDirty(page))
pages_dirty = true;
put_page(page);
ret = vm_insert_anon_page(new_vma, new_addr, page);
if (ret)
goto out_recover;
get_page(page);
old_addr += PAGE_SIZE;
new_addr += PAGE_SIZE;
i++;
}
if (pages_dirty)
new_addr_start = new_addr_start | USWAP_PAGES_DIRTY;
kfree(ptes);
return new_addr_start;
out_recover:
uswapout_recover(mm, old_addr_start, i, pages, new_addr_start, ptes);
kfree(ptes);
return ret;
}
/*
 * uswap_mremap() is called from the mremap() syscall when flags contains
 * MREMAP_USWAP_SET_PTE. Unmap the pages in 'addr ~ addr+old_len' and remap
 * them to 'new_addr ~ new_addr+new_len', setting the old ptes to
 * SWP_USERSWAP_ENTRY.
*/
unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len,
unsigned long new_addr, unsigned long new_len)
{
struct page **pages = NULL;
struct mm_struct *mm = current->mm;
unsigned long len = old_len;
unsigned long ret = -EINVAL;
unsigned long i;
if (!len || old_len != new_len || offset_in_page(old_addr) ||
offset_in_page(new_addr) || (len % PAGE_SIZE))
return ret;
if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len ||
old_addr > TASK_SIZE - old_len)
return ret;
/* Ensure the old/new locations do not overlap */
if (old_addr + old_len > new_addr && new_addr + new_len > old_addr)
return ret;
down_read(&mm->mmap_lock);
ret = pages_can_be_swapped(mm, old_addr, len, &pages);
if (ret) {
up_read(&mm->mmap_lock);
return ret;
}
ret = do_user_swap(mm, old_addr, len, pages, new_addr);
up_read(&mm->mmap_lock);
/* follow_page() above increased the reference */
for (i = 0; i < len / PAGE_SIZE; i++)
if (pages[i])
put_page(pages[i]);
kfree(pages);
return ret;
}
int mfill_atomic_pte_nocopy(struct mm_struct *mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr)
{
struct vm_area_struct *src_vma;
pte_t dst_pte, *pte, src_pte;
pmd_t *src_pmd;
spinlock_t *ptl;
int ret = 0;
struct page *page;
src_vma = find_vma(mm, src_addr);
if (!src_vma || src_addr < src_vma->vm_start)
return -ENOENT;
if (src_vma->vm_flags & VM_LOCKED)
return -EINVAL;
page = follow_page(src_vma, src_addr, FOLL_GET | FOLL_MIGRATION);
if (!page)
return -ENODEV;
src_pmd = mm_find_pmd(mm, src_addr);
if (!src_pmd) {
ret = -ENXIO;
goto out_put_page;
}
uswap_unmap_anon_page(mm, src_vma, src_addr, page, src_pmd, &src_pte,
false);
if (dst_vma->vm_flags & VM_USWAP)
ClearPageDirty(page);
/*
* The memory barrier inside __SetPageUptodate makes sure that
* preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
__SetPageUptodate(page);
dst_pte = mk_pte(page, dst_vma->vm_page_prot);
if (dst_vma->vm_flags & VM_WRITE)
dst_pte = pte_mkwrite(pte_mkdirty(dst_pte));
if (dst_vma->vm_flags & VM_USWAP)
dst_pte = pte_mkclean(dst_pte);
pte = pte_offset_map_lock(mm, dst_pmd, dst_addr, &ptl);
/*
 * Userspace may swap in a large area, parts of which were never swapped
 * out or are being faulted in concurrently, so the pte may already be
 * present; skip those pages. The only states handled here are the first
 * page fault (pte_none) and a userswap-out pte (SWP_USERSWAP_ENTRY).
*/
if (pte_present(*pte) || (!pte_none(*pte) &&
swp_type(pte_to_swp_entry(*pte)) != SWP_USERSWAP_ENTRY)) {
pte_unmap_unlock(pte, ptl);
uswap_map_anon_page(mm, src_vma, src_addr, page, src_pmd,
src_pte);
ret = -EEXIST;
goto out_put_page;
}
inc_mm_counter(mm, MM_ANONPAGES);
reliable_page_counter(page, mm, 1);
page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
set_pte_at(mm, dst_addr, pte, dst_pte);
/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, pte);
pte_unmap_unlock(pte, ptl);
out_put_page:
put_page(page);
return ret;
}
bool uswap_register(struct uffdio_register *uffdio_register, bool *uswap_mode)
{
if (!static_branch_unlikely(&userswap_enabled))
return true;
if (!(uffdio_register->mode & UFFDIO_REGISTER_MODE_USWAP))
return true;
uffdio_register->mode &= ~UFFDIO_REGISTER_MODE_USWAP;
if (!uffdio_register->mode)
return false;
*uswap_mode = true;
return true;
}
/*
 * Register the whole vmas overlapping the address range so the vmas are
 * not split, which reduces fragmentation.
*/
bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register,
unsigned long *vm_flags, struct mm_struct *mm)
{
struct vm_area_struct *vma;
unsigned long end;
bool ret = false;
if (!static_branch_unlikely(&userswap_enabled))
return true;
end = uffdio_register->range.start + uffdio_register->range.len - 1;
mmap_read_lock(mm);
vma = find_vma(mm, uffdio_register->range.start);
if (!vma || vma->vm_start >= end)
goto out_unlock;
uffdio_register->range.start = vma->vm_start;
vma = find_vma(mm, end);
if (vma && end >= vma->vm_start)
uffdio_register->range.len = vma->vm_end - uffdio_register->range.start;
*vm_flags |= VM_USWAP;
ret = true;
out_unlock:
mmap_read_unlock(mm);
return ret;
}
bool do_uswap_page(swp_entry_t entry, struct vm_fault *vmf,
struct vm_area_struct *vma, vm_fault_t *ret)
{
if (!static_branch_unlikely(&userswap_enabled))
return true;
if (swp_type(entry) != SWP_USERSWAP_ENTRY)
return true;
/* print error if we come across a nested fault */
if (!strncmp(current->comm, "uswap", 5)) {
pr_err("USWAP: fault %lx is triggered by %s\n", vmf->address,
current->comm);
*ret = VM_FAULT_SIGBUS;
return false;
}
if (!(vma->vm_flags & VM_UFFD_MISSING)) {
pr_err("USWAP: addr %lx flags %lx is not a user swap page",
vmf->address, vma->vm_flags);
return true;
}
*ret = handle_userfault(vmf, VM_UFFD_MISSING | VM_USWAP);
return false;
}
static int __init enable_userswap_setup(char *str)
{
static_branch_enable(&userswap_enabled);
return 1;
}
__setup("enable_userswap", enable_userswap_setup);