diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index d269d1139f7ff228889c975be56cc11c755fb582..0d19adb40dc2e481a4336eabc1cdc0e966555899 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -327,6 +327,10 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 	 * Lockless access: we're in a wait_event so it's ok if it
 	 * changes under us.
 	 */
+#ifdef CONFIG_USERSWAP
+	if ((reason & VM_USWAP) && (!pte_present(*pte)))
+		ret = true;
+#endif
 	if (pte_none(*pte))
 		ret = true;
 	pte_unmap(pte);
@@ -1321,10 +1325,30 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	ret = -EINVAL;
 	if (!uffdio_register.mode)
 		goto out;
+	vm_flags = 0;
+#ifdef CONFIG_USERSWAP
+	/*
+	 * register the whole vma overlapping with the address range to avoid
+	 * splitting the vma.
+	 */
+	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_USWAP) {
+		uffdio_register.mode &= ~UFFDIO_REGISTER_MODE_USWAP;
+		vm_flags |= VM_USWAP;
+		end = uffdio_register.range.start + uffdio_register.range.len - 1;
+		vma = find_vma(mm, uffdio_register.range.start);
+		if (!vma)
+			goto out;
+		uffdio_register.range.start = vma->vm_start;
+
+		vma = find_vma(mm, end);
+		if (!vma)
+			goto out;
+		uffdio_register.range.len = vma->vm_end - uffdio_register.range.start;
+	}
+#endif
 	if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
 				     UFFDIO_REGISTER_MODE_WP))
 		goto out;
-	vm_flags = 0;
 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
 		vm_flags |= VM_UFFD_MISSING;
 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 37c9eba75c983a7b2488f1a18c2c0f1d520a5425..5912381ec7658710bf7508070879c7c1b9b015b2 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -47,7 +47,11 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
 
 static inline bool userfaultfd_missing(struct vm_area_struct *vma)
 {
+#ifdef CONFIG_USERSWAP
+	return (vma->vm_flags & VM_UFFD_MISSING) && !(vma->vm_flags & VM_USWAP);
+#else
 	return vma->vm_flags & VM_UFFD_MISSING;
+#endif
 }
 
 static inline bool userfaultfd_armed(struct vm_area_struct *vma)
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 48f1a7c2f1f056117ce80166fdd126d1e32c08b4..42e0f860e7f73fc11a2ba90a4757b7709f822766 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -190,6 +190,9 @@ struct uffdio_register {
 	struct uffdio_range range;
 #define UFFDIO_REGISTER_MODE_MISSING	((__u64)1<<0)
 #define UFFDIO_REGISTER_MODE_WP		((__u64)1<<1)
+#ifdef CONFIG_USERSWAP
+#define UFFDIO_REGISTER_MODE_USWAP	((__u64)1<<2)
+#endif
 	__u64 mode;
 
 	/*
diff --git a/mm/memory.c b/mm/memory.c
index 17f3016c7acd9ceeaed6b8db6e3d659a65b77b6e..dbf7fd76958a4f4857e2bf889e2241c9713fc2d1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2769,6 +2769,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		goto out;
 
 	entry = pte_to_swp_entry(vmf->orig_pte);
+#ifdef CONFIG_USERSWAP
+	if (swp_type(entry) == SWP_USERSWAP_ENTRY) {
+		/* print error if we come across a nested fault */
+		if (!strncmp(current->comm, "uswap", 5)) {
+			pr_err("USWAP: fault %lx is triggered by %s\n",
+			       vmf->address, current->comm);
+			return VM_FAULT_SIGBUS;
+		}
+		if (!(vma->vm_flags & VM_UFFD_MISSING)) {
+			pr_err("USWAP: addr %lx flags %lx is not a user swap page\n",
+			       vmf->address, vma->vm_flags);
+			goto skip_uswap;
+		}
+		BUG_ON(!(vma->vm_flags & VM_UFFD_MISSING));
+		ret = handle_userfault(vmf, VM_UFFD_MISSING | VM_USWAP);
+		return ret;
+	}
+skip_uswap:
+#endif
 	if (unlikely(non_swap_entry(entry))) {
 		if (is_migration_entry(entry)) {
 			migration_entry_wait(vma->vm_mm, vmf->pmd,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 7529d3fcc89990f061d11180ac0b97645e7bd4ab..cc6ea42d1ea8191b7dbe4546bf84e54732b0195f 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -60,6 +60,10 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 		*pagep = NULL;
 	}
 
+#ifdef CONFIG_USERSWAP
+	if (dst_vma->vm_flags & VM_USWAP)
+		ClearPageDirty(page);
+#endif
 	/*
 	 * The memory barrier inside __SetPageUptodate makes sure that
 	 * preceeding stores to the page contents become visible before
@@ -74,6 +78,10 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
 	if (dst_vma->vm_flags & VM_WRITE)
 		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
+#ifdef CONFIG_USERSWAP
+	if (dst_vma->vm_flags & VM_USWAP)
+		_dst_pte = pte_mkclean(_dst_pte);
+#endif
 
 	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
 	if (dst_vma->vm_file) {
@@ -85,9 +93,27 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 		if (unlikely(offset >= max_off))
 			goto out_release_uncharge_unlock;
 	}
+
+#ifdef CONFIG_USERSWAP
+	if (!(dst_vma->vm_flags & VM_USWAP)) {
+		ret = -EEXIST;
+		if (!pte_none(*dst_pte))
+			goto out_release_uncharge_unlock;
+	} else {
+		/*
+		 * The userspace may swap in a large area. Part of the area is
+		 * not swapped out. Skip those pages.
+		 */
+		ret = 0;
+		if (swp_type(pte_to_swp_entry(*dst_pte)) != SWP_USERSWAP_ENTRY ||
+		    pte_present(*dst_pte))
+			goto out_release_uncharge_unlock;
+	}
+#else
 	ret = -EEXIST;
 	if (!pte_none(*dst_pte))
 		goto out_release_uncharge_unlock;
+#endif
 	inc_mm_counter(dst_mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
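
For context, a minimal userspace sketch of driving the new registration mode follows. It is an illustration, not part of the patch: it assumes a kernel built with CONFIG_USERSWAP=y, and the fallback #define below merely copies the value from the uapi hunk above, since the real header hides UFFDIO_REGISTER_MODE_USWAP behind CONFIG_USERSWAP, which userspace never sees. Everything else is the stock userfaultfd(2) API.

/*
 * Hedged sketch, not part of the patch: register an anonymous region
 * with the USWAP mode bit introduced above.
 */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef UFFDIO_REGISTER_MODE_USWAP
#define UFFDIO_REGISTER_MODE_USWAP ((__u64)1<<2)	/* mirrors the uapi hunk */
#endif

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (uffd < 0) {
		perror("userfaultfd");
		return 1;
	}

	struct uffdio_api api = { .api = UFFD_API };

	if (ioctl(uffd, UFFDIO_API, &api)) {
		perror("UFFDIO_API");
		return 1;
	}

	void *area = mmap(NULL, 16 * page, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (area == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * MISSING is still the event the monitor receives; per the
	 * fs/userfaultfd.c hunk above, the USWAP bit is stripped in
	 * userfaultfd_register() and recorded as VM_USWAP on the
	 * covering vma(s), which are registered whole.
	 */
	struct uffdio_register reg = {
		.range = { (unsigned long)area, 16 * page },
		.mode  = UFFDIO_REGISTER_MODE_MISSING |
			 UFFDIO_REGISTER_MODE_USWAP,
	};

	if (ioctl(uffd, UFFDIO_REGISTER, &reg)) {
		perror("UFFDIO_REGISTER");
		return 1;
	}
	printf("registered %p for user swap\n", area);
	return 0;
}

A real swap daemon would go on to poll the uffd for page-fault messages and resolve each with UFFDIO_COPY; with VM_USWAP set on the vma, mcopy_atomic_pte() installs a clean pte and tolerates already-present pages, as shown in the mm/userfaultfd.c hunks. Note that do_swap_page() above treats a fault raised by any task whose comm begins with "uswap" as a nested fault and returns VM_FAULT_SIGBUS, so the daemon's thread naming interacts with that check.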