diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 7d0022f82844ea8ba4cfac1ead8b67209bc01d84..c8ec0227f340095163320a0093c878122eebc79e 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -27,13 +27,11 @@ #include <linux/ioctl.h> #include <linux/security.h> #include <linux/hugetlb.h> +#include <linux/userswap.h> int sysctl_unprivileged_userfaultfd __read_mostly = 1; static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; -#ifdef CONFIG_USERSWAP -int enable_userswap; -#endif /* * Start with fault_pending_wqh and fault_wqh so they're more likely @@ -220,6 +218,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address, msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; if (features & UFFD_FEATURE_THREAD_ID) msg.arg.pagefault.feat.ptid = task_pid_vnr(current); +#ifdef CONFIG_USERSWAP + uswap_get_cpu_id(reason, &msg); +#endif return msg; } @@ -334,8 +335,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, * changes under us. */ #ifdef CONFIG_USERSWAP - if ((reason & VM_USWAP) && (!pte_present(*pte))) - ret = true; + uswap_must_wait(reason, *pte, &ret); #endif if (pte_none(*pte)) ret = true; @@ -408,8 +408,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) BUG_ON(ctx->mm != mm); +#ifdef CONFIG_USERSWAP + VM_BUG_ON(uswap_vm_flag_bug_on(reason)); +#else VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); +#endif if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; @@ -483,6 +487,10 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) uwq.wq.private = current; uwq.msg = userfault_msg(vmf->address, vmf->flags, reason, ctx->features); +#ifdef CONFIG_USERSWAP + if (reason & VM_USWAP && pte_none(vmf->orig_pte)) + uwq.msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_FPF; +#endif uwq.ctx = ctx; uwq.waken = false; @@ -866,8 +874,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) for (vma = mm->mmap; vma; vma = vma->vm_next) { userfault_flags = VM_UFFD_MISSING | VM_UFFD_WP; #ifdef CONFIG_USERSWAP - if (enable_userswap) - userfault_flags |= VM_USWAP; + uswap_release(&userfault_flags); #endif cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ @@ -1275,6 +1282,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool found; bool basic_ioctls; unsigned long start, end, vma_end; +#ifdef CONFIG_USERSWAP + bool uswap_mode = false; +#endif user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1288,26 +1298,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; vm_flags = 0; #ifdef CONFIG_USERSWAP - /* - * register the whole vma overlapping with the address range to avoid - * splitting the vma. 
- */ - if (enable_userswap && (uffdio_register.mode & UFFDIO_REGISTER_MODE_USWAP)) { - uffdio_register.mode &= ~UFFDIO_REGISTER_MODE_USWAP; - if (!uffdio_register.mode) - goto out; - vm_flags |= VM_USWAP; - end = uffdio_register.range.start + uffdio_register.range.len - 1; - vma = find_vma(mm, uffdio_register.range.start); - if (!vma) - goto out; - uffdio_register.range.start = vma->vm_start; - - vma = find_vma(mm, end); - if (!vma) - goto out; - uffdio_register.range.len = vma->vm_end - uffdio_register.range.start; - } + if (!uswap_register(&uffdio_register, &uswap_mode)) + goto out; #endif if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING| UFFDIO_REGISTER_MODE_WP)) @@ -1321,7 +1313,13 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, uffdio_register.range.len); if (ret) goto out; - +#ifdef CONFIG_USERSWAP + if (uswap_mode && !uswap_adjust_uffd_range(&uffdio_register, + &vm_flags, mm)) { + ret = -EINVAL; + goto out; + } +#endif start = uffdio_register.range.start; end = start + uffdio_register.range.len; @@ -1717,7 +1715,10 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, ret = -EINVAL; if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) goto out; - if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) + if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE | + UFFDIO_COPY_MODE_WP | + IS_ENABLED(CONFIG_USERSWAP) ? + UFFDIO_COPY_MODE_DIRECT_MAP : 0)) goto out; if (mmget_not_zero(ctx->mm)) { ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, @@ -2029,15 +2030,6 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) return fd; } -#ifdef CONFIG_USERSWAP -static int __init enable_userswap_setup(char *str) -{ - enable_userswap = true; - return 1; -} -__setup("enable_userswap", enable_userswap_setup); -#endif - static int __init userfaultfd_init(void) { userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e1cacab86bde2f7931c6c5fc105e5608d1e11e65..e91f31a4c830f89cf7b7b1a9e83fede93a612043 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -31,9 +31,6 @@ #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) extern int sysctl_unprivileged_userfaultfd; -#ifdef CONFIG_USERSWAP -extern int enable_userswap; -#endif extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); diff --git a/include/linux/userswap.h b/include/linux/userswap.h new file mode 100644 index 0000000000000000000000000000000000000000..43b419f9813fc2e6a61c93805ba303a87427fcd3 --- /dev/null +++ b/include/linux/userswap.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved. + */ + +#ifndef _LINUX_USERSWAP_H +#define _LINUX_USERSWAP_H + +#include <linux/mman.h> +#include <linux/userfaultfd.h> + +#ifdef CONFIG_USERSWAP + +extern struct static_key_false userswap_enabled; + +/* + * In uswap situation, we use the bit 0 of the returned address to indicate + * whether the pages are dirty. 
+ */ +#define USWAP_PAGES_DIRTY 1 + +int mfill_atomic_pte_nocopy(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr); + +unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, + unsigned long new_addr, unsigned long new_len); + +bool uswap_register(struct uffdio_register *uffdio_register, bool *uswap_mode); + +bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register, + unsigned long *vm_flags, struct mm_struct *mm); + +bool do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, + struct vm_area_struct *vma, vm_fault_t *ret); + +static inline bool uswap_check_copy(struct vm_area_struct *vma, + unsigned long src_addr, + unsigned long len, __u64 mode) +{ + if (vma->vm_flags & VM_USWAP) { + if (!(mode & UFFDIO_COPY_MODE_DIRECT_MAP)) + return false; + if (offset_in_page(src_addr)) + return false; + if (src_addr > TASK_SIZE || src_addr > TASK_SIZE - len) + return false; + } else { + if (mode & UFFDIO_COPY_MODE_DIRECT_MAP) + return false; + } + + return true; +} + +static inline bool uswap_validate_mremap_flags(unsigned long flags) +{ + if (static_branch_unlikely(&userswap_enabled)) { + if (flags & MREMAP_USWAP_SET_PTE && + flags & ~MREMAP_USWAP_SET_PTE) + return false; + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | + MREMAP_DONTUNMAP | MREMAP_USWAP_SET_PTE)) + return false; + } else { + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | + MREMAP_DONTUNMAP)) + return false; + } + return true; +} + +/* When CONFIG_USERSWAP=y, VM_UFFD_MISSING|VM_USWAP is right; + * 0 or > 1 flags set is a bug; we expect exactly 1. + */ +static inline bool uswap_vm_flag_bug_on(unsigned long reason) +{ + if (reason & ~(VM_UFFD_MISSING | VM_UFFD_WP | VM_USWAP)) + return true; + if (reason & VM_USWAP) + return !(reason & VM_UFFD_MISSING) || + reason & ~(VM_USWAP|VM_UFFD_MISSING); + return !(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP); +} + +static inline bool uswap_missing(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_USWAP && vma->vm_flags & VM_UFFD_MISSING) + return true; + return false; +} + +static inline void uswap_get_cpu_id(unsigned long reason, struct uffd_msg *msg) +{ + if (reason & VM_USWAP) + msg->reserved3 = smp_processor_id(); +} + +static inline void uswap_release(unsigned long *userfault_flags) +{ + if (static_branch_unlikely(&userswap_enabled)) + *userfault_flags |= VM_USWAP; +} + +static inline void uswap_must_wait(unsigned long reason, pte_t pte, bool *ret) +{ + if ((reason & VM_USWAP) && (!pte_present(pte))) + *ret = true; +} + +#endif /* CONFIG_USERSWAP */ + +#endif /* _LINUX_USERSWAP_H */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 898ea134b2f398f08a9bed3a7915d470cb94a195..66c408ccc6c662a0bacba4ad8474df19707036f4 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -30,8 +30,6 @@ #define MAP_SYNC 0x080000 /* perform synchronous page faults for the mapping */ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ -#define MAP_REPLACE 0x1000000 - #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index f55bc680b5b0a45eefca46732e265a8bfe2f46aa..174a1a2eb041018fde4906ede126d5626fac5353 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -8,6 +8,7 @@ #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 #define 
MREMAP_DONTUNMAP 4 +#define MREMAP_USWAP_SET_PTE 64 #define OVERCOMMIT_GUESS 0 #define OVERCOMMIT_ALWAYS 1 diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index ada058f8b94b3699d5f30e0229c62e1785a5f910..b8689050455d992373d8f2308766b1326d90f25b 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -127,6 +127,7 @@ struct uffd_msg { /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ #define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ +#define UFFD_PAGEFAULT_FLAG_FPF (1<<10) /* If this was the first page fault */ struct uffdio_api { /* userland asks for an API number and the features to enable */ @@ -217,6 +218,7 @@ struct uffdio_copy { * according to the uffdio_register.ioctls. */ #define UFFDIO_COPY_MODE_WP ((__u64)1<<1) +#define UFFDIO_COPY_MODE_DIRECT_MAP ((__u64)1<<10) __u64 mode; /* diff --git a/mm/Makefile b/mm/Makefile index 696ee59c2ac758aaf7d5f5b98291af4522af1128..a014a5e08f7b6a011a27088ed197208f7d4ad442 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -113,6 +113,7 @@ obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o +obj-$(CONFIG_USERSWAP) += userswap.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o diff --git a/mm/memory.c b/mm/memory.c index 8f7d4531c7634a8f9bb7581720add64182095a95..5941a4f4ea4b154243a368f73f2925814f1cf45d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -73,6 +73,7 @@ #include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/vmalloc.h> +#include <linux/userswap.h> #include <trace/events/kmem.h> @@ -3395,22 +3396,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) entry = pte_to_swp_entry(vmf->orig_pte); #ifdef CONFIG_USERSWAP - if (swp_type(entry) == SWP_USERSWAP_ENTRY) { - /* print error if we come across a nested fault */ - if (!strncmp(current->comm, "uswap", 5)) { - pr_err("USWAP: fault %lx is triggered by %s\n", - vmf->address, current->comm); - return VM_FAULT_SIGBUS; - } - if (!(vma->vm_flags & VM_UFFD_MISSING)) { - pr_err("USWAP: addr %lx flags %lx is not a user swap page", - vmf->address, vma->vm_flags); - goto skip_uswap; - } - ret = handle_userfault(vmf, VM_UFFD_MISSING | VM_USWAP); + if (!do_uswap_page(entry, vmf, vma, &ret)) return ret; - } -skip_uswap: #endif if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { @@ -3689,6 +3676,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (ret) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ +#ifdef CONFIG_USERSWAP + if (uswap_missing(vma)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_MISSING|VM_USWAP); + } +#endif if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_MISSING); @@ -3731,6 +3724,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) goto release; /* Deliver the page fault to userland, check inside PT lock */ +#ifdef CONFIG_USERSWAP + if (uswap_missing(vma)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + put_page(page); + return handle_userfault(vmf, VM_UFFD_MISSING | VM_USWAP); + } +#endif if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(page); diff --git a/mm/mmap.c b/mm/mmap.c index 
b3694e09be0f0963679ed79005494a36c0dab7c7..bddd7f0f88b934ec923576c80809a380b7533eee 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1623,205 +1623,6 @@ __do_mmap(struct file *file, unsigned long addr, unsigned long len, { return __do_mmap_mm(current->mm, file, addr, len, prot, flags, vm_flags, pgoff, populate, uf); } -#ifdef CONFIG_USERSWAP -/* - * Check if pages between 'addr ~ addr+len' can be user swapped. If so, get - * the reference of the pages and return the pages through input parameters - * 'ppages'. - */ -static int pages_can_be_swapped(struct mm_struct *mm, unsigned long addr, - unsigned long len, struct page ***ppages) -{ - struct vm_area_struct *vma; - struct page *page = NULL; - struct page **pages = NULL; - unsigned long addr_end = addr + len; - unsigned long ret; - int i, page_num = 0; - - pages = kmalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL); - if (!pages) - return -ENOMEM; - - while (addr < addr_end) { - vma = find_vma(mm, addr); - if (!vma || !vma_is_anonymous(vma) || vma->vm_file || - (vma->vm_flags & VM_LOCKED) || (vma->vm_flags & VM_STACK) || - (vma->vm_flags & (VM_IO | VM_PFNMAP))) { - ret = -EINVAL; - goto out; - } - if (!(vma->vm_flags & VM_UFFD_MISSING)) { - ret = -EAGAIN; - goto out; - } -get_again: - /* follow_page will inc page ref, dec the ref after we remap the page */ - page = follow_page(vma, addr, FOLL_GET); - if (IS_ERR_OR_NULL(page)) { - ret = -ENODEV; - goto out; - } - pages[page_num++] = page; - if (!PageAnon(page) || !PageSwapBacked(page) || - PageHuge(page) || PageSwapCache(page)) { - ret = -EINVAL; - goto out; - } else if (PageTransCompound(page)) { - if (trylock_page(page)) { - if (!split_huge_page(page)) { - put_page(page); - page_num--; - unlock_page(page); - goto get_again; - } else { - unlock_page(page); - ret = -EINVAL; - goto out; - } - } else { - ret = -EINVAL; - goto out; - } - } - if (page_mapcount(page) > 1 || - page_mapcount(page) + 1 != page_count(page)) { - ret = -EBUSY; - goto out; - } - addr += PAGE_SIZE; - } - - *ppages = pages; - return 0; - -out: - for (i = 0; i < page_num; i++) - put_page(pages[i]); - if (pages) - kfree(pages); - *ppages = NULL; - return ret; -} - -/* - * In uswap situation, we use the bit 0 of the returned address to indicate - * whether the pages are dirty. 
- */ -#define USWAP_PAGES_DIRTY 1 - -/* unmap the pages between 'addr ~ addr+len' and remap them to a new address */ -static unsigned long -do_user_swap(struct mm_struct *mm, unsigned long addr_start, unsigned long len, - struct page **pages, unsigned long new_addr) -{ - struct vm_area_struct *vma; - struct page *page; - struct mmu_notifier_range range; - pmd_t *pmd; - pte_t *pte, old_pte; - spinlock_t *ptl; - unsigned long addr; - bool pages_dirty = false; - int i = 0; - - addr = addr_start; - lru_add_drain(); - i = 0; - while (addr < addr_start + len) { - page = pages[i]; - vma = find_vma(mm, addr); - if (!vma) - return -EINVAL; - - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, - vma->vm_mm, addr, addr + PAGE_SIZE); - mmu_notifier_invalidate_range_start(&range); - pmd = mm_find_pmd(mm, addr); - if (!pmd) { - mmu_notifier_invalidate_range_end(&range); - return -ENXIO; - } - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); - flush_cache_page(vma, addr, pte_pfn(*pte)); - old_pte = ptep_clear_flush(vma, addr, pte); - if (pte_dirty(old_pte) || PageDirty(page)) - pages_dirty = true; - set_pte(pte, swp_entry_to_pte(swp_entry(SWP_USERSWAP_ENTRY, - page_to_pfn(page)))); - dec_mm_counter(mm, MM_ANONPAGES); - reliable_page_counter(page, mm, -1); - page_remove_rmap(page, false); - put_page(page); - - pte_unmap_unlock(pte, ptl); - mmu_notifier_invalidate_range_end(&range); - vma->vm_flags |= VM_USWAP; - page->mapping = NULL; - addr += PAGE_SIZE; - i++; - } - - addr = new_addr; - vma = find_vma(mm, addr); - i = 0; - while (addr < new_addr + len) { - if (addr > vma->vm_end - 1) - vma = find_vma(mm, addr); - if (!vma) - return -ENODEV; - - page = pages[i++]; - if (vm_insert_page(vma, addr, page)) - return -EFAULT; - - addr += PAGE_SIZE; - } - vma->vm_flags |= VM_USWAP; - - if (pages_dirty) - new_addr = new_addr | USWAP_PAGES_DIRTY; - - return new_addr; -} - -static inline unsigned long -do_uswap_mmap(struct file *file, unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, unsigned long pgoff, - unsigned long *populate, struct list_head *uf) -{ - struct mm_struct *mm = current->mm; - unsigned long old_addr = addr; - struct page **pages = NULL; - unsigned long ret; - int i; - - if (!len || offset_in_page(addr) || (len % PAGE_SIZE)) - return -EINVAL; - - ret = pages_can_be_swapped(mm, addr, len, &pages); - if (ret) - return ret; - - /* mark the vma as special to avoid merging with other vmas */ - addr = __do_mmap(file, addr, len, prot, flags, VM_SPECIAL, pgoff, - populate, uf); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto out; - } - - ret = do_user_swap(mm, old_addr, len, pages, addr); -out: - /* follow_page() above increased the reference*/ - for (i = 0; i < len / PAGE_SIZE; i++) - put_page(pages[i]); - if (pages) - kfree(pages); - - return ret; -} -#endif /* * The caller must write-lock current->mm->mmap_lock. 
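
The mmap(MAP_REPLACE)-based swap-out path removed above is superseded by the userfaultfd/mremap interface implemented in mm/userswap.c later in this series. Below is a minimal userspace sketch of the registration step, illustrative only and not part of the patch; it assumes UFFDIO_REGISTER_MODE_USWAP is already exported by the UAPI header (uswap_register() consumes it, but this diff does not define it), and the helper name is hypothetical.

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

/*
 * Illustrative helper: register [addr, addr + len) for userswap faults.
 * UFFDIO_REGISTER_MODE_USWAP is assumed to come from the UAPI header;
 * it is not added by this diff.
 */
static int uswap_register_range(void *addr, size_t len)
{
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg;
	int uffd;

	uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0)
		return -1;
	if (ioctl(uffd, UFFDIO_API, &api) < 0)
		goto err;

	memset(&reg, 0, sizeof(reg));
	reg.range.start = (unsigned long)addr;
	reg.range.len = len;
	/* uswap_register() strips the USWAP bit; MISSING must stay set. */
	reg.mode = UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_USWAP;
	if (ioctl(uffd, UFFDIO_REGISTER, &reg) < 0)
		goto err;
	return uffd;
err:
	close(uffd);
	return -1;
}
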
@@ -1831,11 +1632,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf) { -#ifdef CONFIG_USERSWAP - if (enable_userswap && (flags & MAP_REPLACE)) - return do_uswap_mmap(file, addr, len, prot, flags, pgoff, - populate, uf); -#endif return __do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf); } diff --git a/mm/mremap.c b/mm/mremap.c index 2f7f3494a990bc003369e3de46c2b6c922bf5931..b8b694be40bdcee5ded1c7b5e29a5cd73e08be2a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -25,6 +25,7 @@ #include <linux/mm-arch-hooks.h> #include <linux/userfaultfd_k.h> #include <linux/share_pool.h> +#include <linux/userswap.h> #include <asm/cacheflush.h> #include <asm/tlb.h> @@ -915,8 +916,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, */ addr = untagged_addr(addr); +#ifdef CONFIG_USERSWAP + if (!uswap_validate_mremap_flags(flags)) + return ret; +#else if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) return ret; +#endif if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) return ret; @@ -947,6 +953,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (!new_len) return ret; +#ifdef CONFIG_USERSWAP + if (flags & MREMAP_USWAP_SET_PTE) + return uswap_mremap(addr, old_len, new_addr, new_len); +#endif + if (mmap_write_lock_killable(current->mm)) return -EINTR; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 15c46208a2accb1b097f9ff6412b2b63f7400c4a..070359ee383a30490f759f4f77ae39d62fe2ef3e 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -15,6 +15,7 @@ #include <linux/mmu_notifier.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> +#include <linux/userswap.h> #include <asm/tlbflush.h> #include "internal.h" @@ -90,10 +91,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, *pagep = NULL; } -#ifdef CONFIG_USERSWAP - if (dst_vma->vm_flags & VM_USWAP) - ClearPageDirty(page); -#endif /* * The memory barrier inside __SetPageUptodate makes sure that * preceding stores to the page contents become visible before @@ -112,10 +109,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, else _dst_pte = pte_mkwrite(_dst_pte); } -#ifdef CONFIG_USERSWAP - if (dst_vma->vm_flags & VM_USWAP) - _dst_pte = pte_mkclean(_dst_pte); -#endif dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); if (dst_vma->vm_file) { @@ -128,26 +121,9 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, goto out_release_uncharge_unlock; } -#ifdef CONFIG_USERSWAP - if (!(dst_vma->vm_flags & VM_USWAP)) { - ret = -EEXIST; - if (!pte_none(*dst_pte)) - goto out_release_uncharge_unlock; - } else { - /* - * The userspace may swap in a large area. Part of the area is - * not swapped out. Skip those pages. - */ - ret = 0; - if (swp_type(pte_to_swp_entry(*dst_pte)) != SWP_USERSWAP_ENTRY || - pte_present(*dst_pte)) - goto out_release_uncharge_unlock; - } -#else ret = -EEXIST; if (!pte_none(*dst_pte)) goto out_release_uncharge_unlock; -#endif inc_mm_counter(dst_mm, MM_ANONPAGES); reliable_page_counter(page, dst_mm, 1); @@ -535,6 +511,10 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, goto out_unlock; err = -EINVAL; +#ifdef CONFIG_USERSWAP + if (!uswap_check_copy(dst_vma, src_addr, len, mode)) + goto out_unlock; +#endif /* * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but * it will overwrite vm_ops, so vma_is_anonymous must return false. 
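
With MREMAP_USWAP_SET_PTE, mremap() becomes the swap-out entry point: uswap_mremap() moves the pages of a registered range to a caller-supplied destination and leaves SWP_USERSWAP_ENTRY PTEs behind, reporting dirtiness in bit 0 of the returned address (USWAP_PAGES_DIRTY). The sketch below is illustrative only and not part of the patch; it uses the raw syscall because the libc mremap() wrapper may not forward the fifth argument without MREMAP_FIXED, and it assumes the caller has already mapped a writable anonymous destination at new_addr.

#include <stddef.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MREMAP_USWAP_SET_PTE
#define MREMAP_USWAP_SET_PTE	64	/* value added to uapi/linux/mman.h above */
#endif

/*
 * Illustrative helper: swap out a registered range.  'new_addr' is a
 * destination mapping set up by the caller beforehand; on success the
 * kernel has remapped the pages there and left SWP_USERSWAP_ENTRY PTEs
 * at 'addr'.
 */
static long uswap_swap_out(void *addr, size_t len, void *new_addr, int *dirty)
{
	long ret = syscall(SYS_mremap, addr, len, len,
			   MREMAP_USWAP_SET_PTE, new_addr);

	if (ret == -1)
		return -1;
	*dirty = ret & 1;	/* USWAP_PAGES_DIRTY */
	return ret & ~1L;	/* start of the new mapping */
}
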
@@ -605,8 +585,17 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, BUG_ON(pmd_none(*dst_pmd)); BUG_ON(pmd_trans_huge(*dst_pmd)); - err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - src_addr, &page, zeropage, wp_copy); +#ifdef CONFIG_USERSWAP + if (static_branch_unlikely(&userswap_enabled) && + dst_vma->vm_flags & VM_USWAP && + mode & UFFDIO_COPY_MODE_DIRECT_MAP) + err = mfill_atomic_pte_nocopy(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr); + else +#endif + err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, &page, + zeropage, wp_copy); cond_resched(); if (unlikely(err == -ENOENT)) { diff --git a/mm/userswap.c b/mm/userswap.c new file mode 100644 index 0000000000000000000000000000000000000000..2d47f6ed9f9165200e0de0c2408e7b96fa6b9b5d --- /dev/null +++ b/mm/userswap.c @@ -0,0 +1,551 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved. + * + * userswap core file include swap-in and swap-out core function + */ + +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/rmap.h> +#include <linux/mmu_notifier.h> +#include <linux/userswap.h> +#include <linux/userfaultfd_k.h> +#include <linux/security.h> + +#include "internal.h" + +DEFINE_STATIC_KEY_FALSE(userswap_enabled); + +static bool vma_uswap_compatible(struct vm_area_struct *vma) +{ + if (!vma || !(vma->vm_flags & VM_USWAP) || !vma_is_anonymous(vma) || + vma->vm_file || vma->vm_flags & (VM_SHARED | VM_LOCKED | VM_STACK | + VM_IO | VM_PFNMAP)) + return false; + return true; +} + +static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + pgd = pgd_offset(mm, addr); + if (pgd_none_or_clear_bad(pgd)) + return NULL; + + p4d = p4d_offset(pgd, addr); + if (p4d_none_or_clear_bad(p4d)) + return NULL; + + pud = pud_offset(p4d, addr); + if (pud_none_or_clear_bad(pud)) + return NULL; + + return pud; +} + +static bool is_thp_or_huge(struct mm_struct *mm, unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + + pud = get_old_pud(mm, addr); + if (!pud) + return false; + else if (pud_huge(*pud)) + return true; + + pmd = pmd_offset(pud, addr); + if (!pmd) + return false; + else if (pmd_huge(*pmd) || pmd_trans_huge(*pmd)) + return true; + + return false; +} + +/* + * Check if pages between 'addr ~ addr+len' can be user swapped. If so, get + * the reference of the pages and return the pages through input parameters + * 'ppages'. + */ +static unsigned long pages_can_be_swapped(struct mm_struct *mm, + unsigned long addr, + unsigned long len, + struct page ***ppages) +{ + struct vm_area_struct *vma; + struct page *page = NULL; + struct page **pages = NULL; + unsigned long addr_end = addr + len; + unsigned long ret; + unsigned long i, page_num = 0; + *ppages = NULL; + + + pages = kmalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + while (addr < addr_end) { + vma = find_vma(mm, addr); + if (!vma || addr < vma->vm_start || + !vma_uswap_compatible(vma)) { + ret = -EINVAL; + goto out_err; + } + + if (!(vma->vm_flags & VM_UFFD_MISSING)) { + ret = -EAGAIN; + goto out_err; + } +get_again: + /* + * follow_page will inc page ref, dec the ref after we remap + * the page. 
+ */ + page = follow_page(vma, addr, FOLL_GET); + if (IS_ERR_OR_NULL(page)) { + ret = -ENODEV; + goto out_err; + } + + pages[page_num++] = page; + if (!PageAnon(page) || !PageSwapBacked(page) || + PageHuge(page) || PageSwapCache(page)) { + ret = -EINVAL; + goto out_err; + } + + if (PageTransCompound(page)) { + if (trylock_page(page)) { + if (!split_huge_page(page)) { + unlock_page(page); + put_page(page); + page_num--; + goto get_again; + } else + unlock_page(page); + } + ret = -EINVAL; + goto out_err; + } + + /* + * Check that no O_DIRECT or similar I/O is in progress on the + * page + */ + if (page_mapcount(page) > 1) { + ret = -EBUSY; + goto out_err; + } + addr += PAGE_SIZE; + } + + *ppages = pages; + return 0; + +out_err: + for (i = 0; i < page_num; i++) + put_page(pages[i]); + kfree(pages); + return ret; +} + +static void uswap_unmap_anon_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, struct page *page, + pmd_t *pmd, pte_t *old_pte, + bool set_to_swp) +{ + struct mmu_notifier_range range; + spinlock_t *ptl; + pte_t *pte, _old_pte; + + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, + vma->vm_mm, addr, addr + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (pte_none(*pte)) + goto out_release_unlock; + flush_cache_page(vma, addr, pte_pfn(*pte)); + _old_pte = ptep_clear_flush(vma, addr, pte); + if (set_to_swp) + set_pte_at(mm, addr, pte, swp_entry_to_pte(swp_entry( + SWP_USERSWAP_ENTRY, page_to_pfn(page)))); + + dec_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, -1); + page_remove_rmap(page, false); + +out_release_unlock: + pte_unmap_unlock(pte, ptl); + mmu_notifier_invalidate_range_end(&range); + page->mapping = NULL; + if (old_pte) + *old_pte = _old_pte; +} + +static void uswap_map_anon_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + struct page *page, + pmd_t *pmd, + pte_t old_pte) +{ + spinlock_t *ptl; + pte_t *pte; + + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + flush_cache_page(vma, addr, pte_pfn(*pte)); + set_pte_at(mm, addr, pte, old_pte); + inc_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, 1); + page_add_new_anon_rmap(page, vma, addr, false); + pte_unmap_unlock(pte, ptl); +} + +static unsigned long vm_insert_anon_page(struct vm_area_struct *vma, + unsigned long addr, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + int ret = 0; + pte_t *pte; + spinlock_t *ptl; + + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; + + flush_dcache_page(page); + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + return -ENOMEM; + if (!pte_none(*pte)) { + ret = -EBUSY; + goto out_unlock; + } + + inc_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, 1); + page_add_new_anon_rmap(page, vma, addr, false); + set_pte_at(mm, addr, pte, mk_pte(page, vma->vm_page_prot)); + +out_unlock: + pte_unmap_unlock(pte, ptl); + return ret; +} + +static void uswapout_recover(struct mm_struct *mm, + unsigned long old_addr_start, unsigned long len, + struct page **pages, unsigned long new_addr_start, + pte_t *ptes) +{ + unsigned long unmap_old_addr = old_addr_start; + unsigned long unmap_new_addr = new_addr_start; + struct page *page; + pmd_t *old_pmd, *new_pmd; + pte_t pte; + unsigned long i; + + for (i = 0; i < len; i++) { + page = pages[i]; + pte = ptes[i]; + new_pmd = mm_find_pmd(mm, new_addr_start); + old_pmd = mm_find_pmd(mm, unmap_old_addr); + + uswap_unmap_anon_page(mm, find_vma(mm, unmap_new_addr), + 
unmap_new_addr, page, new_pmd, NULL, + false); + uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr), + unmap_old_addr, page, old_pmd, pte); + unmap_old_addr += PAGE_SIZE; + unmap_new_addr += PAGE_SIZE; + } + if (pte_val(ptes[len]) != 0) { + page = pages[len]; + pte = ptes[len]; + old_pmd = mm_find_pmd(mm, unmap_old_addr); + + uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr), + unmap_old_addr, page, old_pmd, pte); + get_page(page); + } +} + +/* unmap the pages between 'addr ~ addr+len' and remap them to a new address */ +static unsigned long do_user_swap(struct mm_struct *mm, + unsigned long old_addr_start, + unsigned long len, struct page **pages, + unsigned long new_addr_start) +{ + struct vm_area_struct *old_vma, *new_vma; + unsigned long old_addr = old_addr_start; + unsigned long new_addr = new_addr_start; + struct page *page; + pmd_t *pmd; + pte_t old_pte, *ptes; + bool pages_dirty = false; + unsigned long i = 0, j; + int ret; + + ptes = kmalloc(sizeof(pte_t) * (len / PAGE_SIZE), GFP_KERNEL); + if (!ptes) + return -ENOMEM; + memset(ptes, 0, sizeof(pte_t) * (len / PAGE_SIZE)); + lru_add_drain(); + for (j = 0; j < len; j += PAGE_SIZE) { + page = pages[i]; + ret = -EINVAL; + if (!page) + goto out_recover; + if (is_thp_or_huge(mm, new_addr)) + goto out_recover; + old_vma = find_vma(mm, old_addr); + if (!old_vma || old_addr < old_vma->vm_start) + goto out_recover; + new_vma = find_vma(mm, new_addr); + if (!new_vma || new_addr < new_vma->vm_start) + goto out_recover; + + ret = -EACCES; + if (!(old_vma->vm_flags & VM_WRITE) && + (new_vma->vm_flags & VM_WRITE)) + goto out_recover; + + ret = -ENXIO; + pmd = mm_find_pmd(mm, old_addr); + if (!pmd) + goto out_recover; + uswap_unmap_anon_page(mm, old_vma, old_addr, page, pmd, + &old_pte, true); + ptes[i] = old_pte; + if (pte_dirty(old_pte) || PageDirty(page)) + pages_dirty = true; + put_page(page); + + ret = vm_insert_anon_page(new_vma, new_addr, page); + if (ret) + goto out_recover; + get_page(page); + + old_addr += PAGE_SIZE; + new_addr += PAGE_SIZE; + i++; + } + + if (pages_dirty) + new_addr_start = new_addr_start | USWAP_PAGES_DIRTY; + kfree(ptes); + return new_addr_start; + +out_recover: + uswapout_recover(mm, old_addr_start, i, pages, new_addr_start, ptes); + kfree(ptes); + return ret; +} + + +/* + * When flags is MREMAP_USWAP_SET_PTE, uswap_mremap() is called in syscall + * mremap. + * Unmap the pages between 'addr ~addr+old_len' and remap them to 'new_addr + * ~ new_addr+new_len'. Set the pte of old_addr to SWP_USERSWAP_ENTRY. 
+ */ +unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, + unsigned long new_addr, unsigned long new_len) +{ + struct page **pages = NULL; + struct mm_struct *mm = current->mm; + unsigned long len = old_len; + unsigned long ret = -EINVAL; + unsigned long i; + + if (!len || old_len != new_len || offset_in_page(old_addr) || + offset_in_page(new_addr) || (len % PAGE_SIZE)) + return ret; + + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len || + old_addr > TASK_SIZE - old_len) + return ret; + + /* Ensure the old/new locations do not overlap */ + if (old_addr + old_len > new_addr && new_addr + new_len > old_addr) + return ret; + + down_read(&mm->mmap_lock); + ret = pages_can_be_swapped(mm, old_addr, len, &pages); + if (ret) { + up_read(&mm->mmap_lock); + return ret; + } + + ret = do_user_swap(mm, old_addr, len, pages, new_addr); + up_read(&mm->mmap_lock); + /* follow_page() above increased the reference*/ + for (i = 0; i < len / PAGE_SIZE; i++) + if (pages[i]) + put_page(pages[i]); + kfree(pages); + return ret; +} + +int mfill_atomic_pte_nocopy(struct mm_struct *mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr) +{ + struct vm_area_struct *src_vma; + pte_t dst_pte, *pte, src_pte; + pmd_t *src_pmd; + spinlock_t *ptl; + int ret = 0; + struct page *page; + + src_vma = find_vma(mm, src_addr); + if (!src_vma || src_addr < src_vma->vm_start) + return -ENOENT; + + if (src_vma->vm_flags & VM_LOCKED) + return -EINVAL; + + page = follow_page(src_vma, src_addr, FOLL_GET | FOLL_MIGRATION); + if (!page) + return -ENODEV; + + src_pmd = mm_find_pmd(mm, src_addr); + if (!src_pmd) { + ret = -ENXIO; + goto out_put_page; + } + uswap_unmap_anon_page(mm, src_vma, src_addr, page, src_pmd, &src_pte, + false); + + if (dst_vma->vm_flags & VM_USWAP) + ClearPageDirty(page); + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + + dst_pte = mk_pte(page, dst_vma->vm_page_prot); + if (dst_vma->vm_flags & VM_WRITE) + dst_pte = pte_mkwrite(pte_mkdirty(dst_pte)); + if (dst_vma->vm_flags & VM_USWAP) + dst_pte = pte_mkclean(dst_pte); + + pte = pte_offset_map_lock(mm, dst_pmd, dst_addr, &ptl); + + /* + * The userspace may swap in a large area. Part of the area is not + * swapped out. If concurrent execution, PTE may be present. Skip those + * pages (pte_present). + * No other scenes should be handled except first pagefault (pte_none) + * and after userswap out (SWP_USERSWAP_ENTRY). 
+ */ + if (pte_present(*pte) || (!pte_none(*pte) && + swp_type(pte_to_swp_entry(*pte)) != SWP_USERSWAP_ENTRY)) { + pte_unmap_unlock(pte, ptl); + uswap_map_anon_page(mm, src_vma, src_addr, page, src_pmd, + src_pte); + ret = -EEXIST; + goto out_put_page; + } + + inc_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, 1); + page_add_new_anon_rmap(page, dst_vma, dst_addr, false); + set_pte_at(mm, dst_addr, pte, dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, pte); + pte_unmap_unlock(pte, ptl); + +out_put_page: + put_page(page); + return ret; +} + +bool uswap_register(struct uffdio_register *uffdio_register, bool *uswap_mode) +{ + if (!static_branch_unlikely(&userswap_enabled)) + return true; + if (!(uffdio_register->mode & UFFDIO_REGISTER_MODE_USWAP)) + return true; + uffdio_register->mode &= ~UFFDIO_REGISTER_MODE_USWAP; + if (!uffdio_register->mode) + return false; + *uswap_mode = true; + return true; +} + +/* + * register the whole vma overlapping with the address range to avoid splitting + * the vma which could reduce fragmentation. + */ +bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register, + unsigned long *vm_flags, struct mm_struct *mm) +{ + struct vm_area_struct *vma; + unsigned long end; + bool ret = false; + + if (!static_branch_unlikely(&userswap_enabled)) + return true; + end = uffdio_register->range.start + uffdio_register->range.len - 1; + + mmap_read_lock(mm); + vma = find_vma(mm, uffdio_register->range.start); + if (!vma || vma->vm_start >= end) + goto out_unlock; + uffdio_register->range.start = vma->vm_start; + vma = find_vma(mm, end); + if (vma && end >= vma->vm_start) + uffdio_register->range.len = vma->vm_end - uffdio_register->range.start; + + *vm_flags |= VM_USWAP; + + ret = true; +out_unlock: + mmap_read_unlock(mm); + return ret; +} + +bool do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, + struct vm_area_struct *vma, vm_fault_t *ret) +{ + if (!static_branch_unlikely(&userswap_enabled)) + return true; + + if (swp_type(entry) != SWP_USERSWAP_ENTRY) + return true; + + /* print error if we come across a nested fault */ + if (!strncmp(current->comm, "uswap", 5)) { + pr_err("USWAP: fault %lx is triggered by %s\n", vmf->address, + current->comm); + *ret = VM_FAULT_SIGBUS; + return false; + } + + if (!(vma->vm_flags & VM_UFFD_MISSING)) { + pr_err("USWAP: addr %lx flags %lx is not a user swap page", + vmf->address, vma->vm_flags); + return true; + } + + *ret = handle_userfault(vmf, VM_UFFD_MISSING | VM_USWAP); + return false; +} + +static int __init enable_userswap_setup(char *str) +{ + static_branch_enable(&userswap_enabled); + return 1; +} +__setup("enable_userswap", enable_userswap_setup);
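
On the fault-handling side, uswap_get_cpu_id() stores the faulting CPU in msg.reserved3, UFFD_PAGEFAULT_FLAG_FPF marks a first-touch fault (pte_none), and UFFDIO_COPY with UFFDIO_COPY_MODE_DIRECT_MAP resolves the fault through mfill_atomic_pte_nocopy(), which remaps the page-aligned source page rather than copying it. A hedged userspace sketch of that swap-in step follows; it is illustrative only and not part of the patch.

#include <stddef.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

#ifndef UFFDIO_COPY_MODE_DIRECT_MAP
#define UFFDIO_COPY_MODE_DIRECT_MAP	((__u64)1 << 10)
#endif
#ifndef UFFD_PAGEFAULT_FLAG_FPF
#define UFFD_PAGEFAULT_FLAG_FPF		(1 << 10)
#endif

/*
 * Illustrative helper: resolve one userswap missing fault.  'src' must
 * be page aligned (enforced by uswap_check_copy()); the kernel maps
 * that page into place instead of copying it.
 */
static int uswap_swap_in(int uffd, unsigned long page_size, void *src)
{
	struct uffd_msg msg;
	struct uffdio_copy copy;

	if (read(uffd, &msg, sizeof(msg)) != sizeof(msg) ||
	    msg.event != UFFD_EVENT_PAGEFAULT)
		return -1;

	/* msg.reserved3 carries the faulting CPU (uswap_get_cpu_id()). */
	if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_FPF)
		memset(src, 0, page_size);	/* first fault: nothing was swapped out */

	copy.dst = msg.arg.pagefault.address & ~(page_size - 1);
	copy.src = (unsigned long)src;
	copy.len = page_size;
	copy.mode = UFFDIO_COPY_MODE_DIRECT_MAP;
	copy.copy = 0;
	return ioctl(uffd, UFFDIO_COPY, &copy);
}
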