diff --git a/include/linux/userswap.h b/include/linux/userswap.h
index fe2c868851fbf0c538a1741aef6958f33fe8188c..82cc79584e4362078dda37ac7a586f0c41e6a99f 100644
--- a/include/linux/userswap.h
+++ b/include/linux/userswap.h
@@ -6,16 +6,28 @@
 #ifndef _LINUX_USERSWAP_H
 #define _LINUX_USERSWAP_H
 
+#include
+#include
+
 #ifdef CONFIG_USERSWAP
 
 extern int enable_userswap;
 
+/*
+ * In uswap situation, we use the bit 0 of the returned address to indicate
+ * whether the pages are dirty.
+ */
+#define USWAP_PAGES_DIRTY 1
+
 int mfill_atomic_pte_nocopy(struct mm_struct *dst_mm, pmd_t *dst_pmd,
			     struct vm_area_struct *dst_vma,
			     unsigned long dst_addr, unsigned long src_addr);
 
+unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len,
+			   unsigned long new_addr, unsigned long new_len);
+
 static inline bool uswap_check_copy_mode(struct vm_area_struct *vma, __u64 mode)
 {
	if (!(vma->vm_flags & VM_USWAP) && (mode & UFFDIO_COPY_MODE_DIRECT_MAP))
@@ -23,6 +35,18 @@ static inline bool uswap_check_copy_mode(struct vm_area_struct *vma, __u64 mode)
	return true;
 }
 
+static inline bool uswap_validate_mremap_flags(unsigned long flags)
+{
+	if (!enable_userswap && flags & MREMAP_USWAP_SET_PTE)
+		return false;
+	if (flags & MREMAP_USWAP_SET_PTE && flags & ~MREMAP_USWAP_SET_PTE)
+		return false;
+	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP |
+		      MREMAP_USWAP_SET_PTE))
+		return false;
+	return true;
+}
+
 #endif /* CONFIG_USERSWAP */
 
 #endif /* _LINUX_USERSWAP_H */
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 898ea134b2f398f08a9bed3a7915d470cb94a195..66c408ccc6c662a0bacba4ad8474df19707036f4 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -30,8 +30,6 @@
 #define MAP_SYNC		0x080000 /* perform synchronous page faults for the mapping */
 #define MAP_FIXED_NOREPLACE	0x100000	/* MAP_FIXED which doesn't unmap underlying mapping */
 
-#define MAP_REPLACE		0x1000000
-
 #define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be
					 * uninitialized */
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index f55bc680b5b0a45eefca46732e265a8bfe2f46aa..174a1a2eb041018fde4906ede126d5626fac5353 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -8,6 +8,7 @@
 #define MREMAP_MAYMOVE		1
 #define MREMAP_FIXED		2
 #define MREMAP_DONTUNMAP	4
+#define MREMAP_USWAP_SET_PTE	64
 
 #define OVERCOMMIT_GUESS	0
 #define OVERCOMMIT_ALWAYS	1
diff --git a/mm/mmap.c b/mm/mmap.c
index a5867d0391530432101653f56991907dc9bea59b..d6f51da7aad8eed72c620ca9769018a67e623aa0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -49,7 +49,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 
@@ -1623,205 +1622,6 @@ __do_mmap(struct file *file, unsigned long addr, unsigned long len,
 {
	return __do_mmap_mm(current->mm, file, addr, len, prot, flags, vm_flags, pgoff, populate, uf);
 }
-#ifdef CONFIG_USERSWAP
-/*
- * Check if pages between 'addr ~ addr+len' can be user swapped. If so, get
- * the reference of the pages and return the pages through input parameters
- * 'ppages'.
- */
-static int pages_can_be_swapped(struct mm_struct *mm, unsigned long addr,
-				unsigned long len, struct page ***ppages)
-{
-	struct vm_area_struct *vma;
-	struct page *page = NULL;
-	struct page **pages = NULL;
-	unsigned long addr_end = addr + len;
-	unsigned long ret;
-	int i, page_num = 0;
-
-	pages = kmalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL);
-	if (!pages)
-		return -ENOMEM;
-
-	while (addr < addr_end) {
-		vma = find_vma(mm, addr);
-		if (!vma || !vma_is_anonymous(vma) || vma->vm_file ||
-		    (vma->vm_flags & VM_LOCKED) || (vma->vm_flags & VM_STACK) ||
-		    (vma->vm_flags & (VM_IO | VM_PFNMAP))) {
-			ret = -EINVAL;
-			goto out;
-		}
-		if (!(vma->vm_flags & VM_UFFD_MISSING)) {
-			ret = -EAGAIN;
-			goto out;
-		}
-get_again:
-		/* follow_page will inc page ref, dec the ref after we remap the page */
-		page = follow_page(vma, addr, FOLL_GET);
-		if (IS_ERR_OR_NULL(page)) {
-			ret = -ENODEV;
-			goto out;
-		}
-		pages[page_num++] = page;
-		if (!PageAnon(page) || !PageSwapBacked(page) ||
-		    PageHuge(page) || PageSwapCache(page)) {
-			ret = -EINVAL;
-			goto out;
-		} else if (PageTransCompound(page)) {
-			if (trylock_page(page)) {
-				if (!split_huge_page(page)) {
-					put_page(page);
-					page_num--;
-					unlock_page(page);
-					goto get_again;
-				} else {
-					unlock_page(page);
-					ret = -EINVAL;
-					goto out;
-				}
-			} else {
-				ret = -EINVAL;
-				goto out;
-			}
-		}
-		if (page_mapcount(page) > 1 ||
-		    page_mapcount(page) + 1 != page_count(page)) {
-			ret = -EBUSY;
-			goto out;
-		}
-		addr += PAGE_SIZE;
-	}
-
-	*ppages = pages;
-	return 0;
-
-out:
-	for (i = 0; i < page_num; i++)
-		put_page(pages[i]);
-	if (pages)
-		kfree(pages);
-	*ppages = NULL;
-	return ret;
-}
-
-/*
- * In uswap situation, we use the bit 0 of the returned address to indicate
- * whether the pages are dirty.
- */
-#define USWAP_PAGES_DIRTY 1
-
-/* unmap the pages between 'addr ~ addr+len' and remap them to a new address */
-static unsigned long
-do_user_swap(struct mm_struct *mm, unsigned long addr_start, unsigned long len,
-	     struct page **pages, unsigned long new_addr)
-{
-	struct vm_area_struct *vma;
-	struct page *page;
-	struct mmu_notifier_range range;
-	pmd_t *pmd;
-	pte_t *pte, old_pte;
-	spinlock_t *ptl;
-	unsigned long addr;
-	bool pages_dirty = false;
-	int i = 0;
-
-	addr = addr_start;
-	lru_add_drain();
-	i = 0;
-	while (addr < addr_start + len) {
-		page = pages[i];
-		vma = find_vma(mm, addr);
-		if (!vma)
-			return -EINVAL;
-
-		mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma,
-					vma->vm_mm, addr, addr + PAGE_SIZE);
-		mmu_notifier_invalidate_range_start(&range);
-		pmd = mm_find_pmd(mm, addr);
-		if (!pmd) {
-			mmu_notifier_invalidate_range_end(&range);
-			return -ENXIO;
-		}
-		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
-		flush_cache_page(vma, addr, pte_pfn(*pte));
-		old_pte = ptep_clear_flush(vma, addr, pte);
-		if (pte_dirty(old_pte) || PageDirty(page))
-			pages_dirty = true;
-		set_pte(pte, swp_entry_to_pte(swp_entry(SWP_USERSWAP_ENTRY,
-							page_to_pfn(page))));
-		dec_mm_counter(mm, MM_ANONPAGES);
-		reliable_page_counter(page, mm, -1);
-		page_remove_rmap(page, false);
-		put_page(page);
-
-		pte_unmap_unlock(pte, ptl);
-		mmu_notifier_invalidate_range_end(&range);
-		vma->vm_flags |= VM_USWAP;
-		page->mapping = NULL;
-		addr += PAGE_SIZE;
-		i++;
-	}
-
-	addr = new_addr;
-	vma = find_vma(mm, addr);
-	i = 0;
-	while (addr < new_addr + len) {
-		if (addr > vma->vm_end - 1)
-			vma = find_vma(mm, addr);
-		if (!vma)
-			return -ENODEV;
-
-		page = pages[i++];
-		if (vm_insert_page(vma, addr, page))
-			return -EFAULT;
-
-		addr += PAGE_SIZE;
-	}
-	vma->vm_flags |= VM_USWAP;
-
-	if (pages_dirty)
-		new_addr = new_addr | USWAP_PAGES_DIRTY;
-
-	return new_addr;
-}
-
-static inline unsigned long
-do_uswap_mmap(struct file *file, unsigned long addr, unsigned long len,
-	      unsigned long prot, unsigned long flags, unsigned long pgoff,
-	      unsigned long *populate, struct list_head *uf)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long old_addr = addr;
-	struct page **pages = NULL;
-	unsigned long ret;
-	int i;
-
-	if (!len || offset_in_page(addr) || (len % PAGE_SIZE))
-		return -EINVAL;
-
-	ret = pages_can_be_swapped(mm, addr, len, &pages);
-	if (ret)
-		return ret;
-
-	/* mark the vma as special to avoid merging with other vmas */
-	addr = __do_mmap(file, addr, len, prot, flags, VM_SPECIAL, pgoff,
-			 populate, uf);
-	if (IS_ERR_VALUE(addr)) {
-		ret = addr;
-		goto out;
-	}
-
-	ret = do_user_swap(mm, old_addr, len, pages, addr);
-out:
-	/* follow_page() above increased the reference*/
-	for (i = 0; i < len / PAGE_SIZE; i++)
-		put_page(pages[i]);
-	if (pages)
-		kfree(pages);
-
-	return ret;
-}
-#endif
 
 /*
  * The caller must write-lock current->mm->mmap_lock.
  */
@@ -1831,11 +1631,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
			unsigned long flags, unsigned long pgoff,
			unsigned long *populate, struct list_head *uf)
 {
-#ifdef CONFIG_USERSWAP
-	if (enable_userswap && (flags & MAP_REPLACE))
-		return do_uswap_mmap(file, addr, len, prot, flags, pgoff,
-				     populate, uf);
-#endif
	return __do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf);
 }
diff --git a/mm/mremap.c b/mm/mremap.c
index 2f7f3494a990bc003369e3de46c2b6c922bf5931..b8b694be40bdcee5ded1c7b5e29a5cd73e08be2a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -915,8 +916,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
	 */
	addr = untagged_addr(addr);
 
+#ifdef CONFIG_USERSWAP
+	if (!uswap_validate_mremap_flags(flags))
+		return ret;
+#else
	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
		return ret;
+#endif
 
	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
		return ret;
 
@@ -947,6 +953,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
	if (!new_len)
		return ret;
 
+#ifdef CONFIG_USERSWAP
+	if (flags & MREMAP_USWAP_SET_PTE)
+		return uswap_mremap(addr, old_len, new_addr, new_len);
+#endif
+
	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
 
diff --git a/mm/userswap.c b/mm/userswap.c
index fe33fda975d1232bbb6c5431890fb0c7c03d6660..dd212b1a02e6ed72b1d1864c1b9c833447690f9e 100644
--- a/mm/userswap.c
+++ b/mm/userswap.c
@@ -9,15 +9,155 @@
 #include
 #include
 #include
+#include
 #include "internal.h"
 
 int enable_userswap;
 
+static bool vma_uswap_compatible(struct vm_area_struct *vma)
+{
+	if (!vma || !(vma->vm_flags & VM_USWAP) || !vma_is_anonymous(vma) ||
+	    vma->vm_file || vma->vm_flags & (VM_SHARED | VM_LOCKED | VM_STACK |
+	    VM_IO | VM_PFNMAP))
+		return false;
+	return true;
+}
+
+static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_none_or_clear_bad(pgd))
+		return NULL;
+
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none_or_clear_bad(p4d))
+		return NULL;
+
+	pud = pud_offset(p4d, addr);
+	if (pud_none_or_clear_bad(pud))
+		return NULL;
+
+	return pud;
+}
+
+static bool is_thp_or_huge(struct mm_struct *mm, unsigned long addr)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pud = get_old_pud(mm, addr);
+	if (!pud)
+		return false;
+	else if (pud_huge(*pud))
+		return true;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd)
+		return false;
+	else if (pmd_huge(*pmd) || pmd_trans_huge(*pmd))
+		return true;
+
+	return false;
+}
+
+/*
+ * Check if pages between 'addr ~ addr+len' can be user swapped. If so, get
+ * the reference of the pages and return the pages through input parameters
+ * 'ppages'.
+ */
+static unsigned long pages_can_be_swapped(struct mm_struct *mm,
+					  unsigned long addr,
+					  unsigned long len,
+					  struct page ***ppages)
+{
+	struct vm_area_struct *vma;
+	struct page *page = NULL;
+	struct page **pages = NULL;
+	unsigned long addr_end = addr + len;
+	unsigned long ret;
+	int i, page_num = 0;
+	*ppages = NULL;
+
+
+	pages = kmalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	while (addr < addr_end) {
+		vma = find_vma(mm, addr);
+		if (!vma || addr < vma->vm_start ||
+		    !vma_uswap_compatible(vma)) {
+			ret = -EINVAL;
+			goto out_err;
+		}
+
+		if (!(vma->vm_flags & VM_UFFD_MISSING)) {
+			ret = -EAGAIN;
+			goto out_err;
+		}
+get_again:
+		/*
+		 * follow_page will inc page ref, dec the ref after we remap
+		 * the page.
+		 */
+		page = follow_page(vma, addr, FOLL_GET);
+		if (IS_ERR_OR_NULL(page)) {
+			ret = -ENODEV;
+			goto out_err;
+		}
+
+		pages[page_num++] = page;
+		if (!PageAnon(page) || !PageSwapBacked(page) ||
+		    PageHuge(page) || PageSwapCache(page)) {
+			ret = -EINVAL;
+			goto out_err;
+		}
+
+		if (PageTransCompound(page)) {
+			if (trylock_page(page)) {
+				if (!split_huge_page(page)) {
+					unlock_page(page);
+					put_page(page);
+					page_num--;
+					goto get_again;
+				} else
+					unlock_page(page);
+			}
+			ret = -EINVAL;
+			goto out_err;
+		}
+
+		/*
+		 * Check that no O_DIRECT or similar I/O is in progress on the
+		 * page
+		 */
+		if (page_mapcount(page) > 1) {
+			ret = -EBUSY;
+			goto out_err;
+		}
+		addr += PAGE_SIZE;
+	}
+
+	*ppages = pages;
+	return 0;
+
+out_err:
+	for (i = 0; i < page_num; i++)
+		put_page(pages[i]);
+	kfree(pages);
+	return ret;
+}
+
 static void uswap_unmap_anon_page(struct mm_struct *mm,
				  struct vm_area_struct *vma,
				  unsigned long addr, struct page *page,
-				  pmd_t *pmd, pte_t *old_pte)
+				  pmd_t *pmd, pte_t *old_pte,
+				  bool set_to_swp)
 {
	struct mmu_notifier_range range;
	spinlock_t *ptl;
@@ -31,6 +171,9 @@ static void uswap_unmap_anon_page(struct mm_struct *mm,
		goto out_release_unlock;
	flush_cache_page(vma, addr, pte_pfn(*pte));
	*old_pte = ptep_clear_flush(vma, addr, pte);
+	if (set_to_swp)
+		set_pte_at(mm, addr, pte, swp_entry_to_pte(swp_entry(
+			   SWP_USERSWAP_ENTRY, page_to_pfn(page))));
 
	dec_mm_counter(mm, MM_ANONPAGES);
	reliable_page_counter(page, mm, -1);
@@ -61,6 +204,182 @@ static void uswap_map_anon_page(struct mm_struct *mm,
	pte_unmap_unlock(pte, ptl);
 }
 
+static unsigned long vm_insert_anon_page(struct vm_area_struct *vma,
+					 unsigned long addr, struct page *page)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	int ret = 0;
+	pte_t *pte;
+	spinlock_t *ptl;
+
+	if (unlikely(anon_vma_prepare(vma)))
+		return -ENOMEM;
+
+	flush_dcache_page(page);
+	pte = get_locked_pte(mm, addr, &ptl);
+	if (!pte)
+		return -ENOMEM;
+	if (!pte_none(*pte)) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	inc_mm_counter(mm, MM_ANONPAGES);
+	reliable_page_counter(page, mm, 1);
+	page_add_new_anon_rmap(page, vma, addr, false);
+	set_pte_at(mm, addr, pte, mk_pte(page, vma->vm_page_prot));
+
+out_unlock:
+	pte_unmap_unlock(pte, ptl);
+	return ret;
+}
+
+static void uswapout_recover(struct mm_struct *mm,
+			     unsigned long old_addr_start, unsigned long len,
+			     struct page **pages, unsigned long new_addr_start,
+			     pte_t *ptes)
+{
+	unsigned long unmap_old_addr = old_addr_start;
+	unsigned long unmap_new_addr = new_addr_start;
+	struct page *page;
+	pmd_t *old_pmd, *new_pmd;
+	pte_t pte;
+	int i;
+
+	for (i = 0; i < len; i++) {
+		page = pages[i];
+		pte = ptes[i];
+		new_pmd = mm_find_pmd(mm, new_addr_start);
+		old_pmd = mm_find_pmd(mm, unmap_old_addr);
+
+		uswap_unmap_anon_page(mm, find_vma(mm, unmap_new_addr),
+				      unmap_new_addr, page, new_pmd, NULL,
+				      false);
+		uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr),
+				    unmap_old_addr, page, old_pmd, pte);
+		unmap_old_addr += PAGE_SIZE;
+		unmap_new_addr += PAGE_SIZE;
+	}
+	if (pte_val(ptes[len]) != 0) {
+		page = pages[len];
+		pte = ptes[len];
+		old_pmd = mm_find_pmd(mm, unmap_old_addr);
+
+		uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr),
+				    unmap_old_addr, page, old_pmd, pte);
+		get_page(page);
+	}
+}
+
+/* unmap the pages between 'addr ~ addr+len' and remap them to a new address */
+static unsigned long do_user_swap(struct mm_struct *mm,
+				  unsigned long old_addr_start,
+				  unsigned long len, struct page **pages,
+				  unsigned long new_addr_start)
+{
+	struct vm_area_struct *old_vma, *new_vma;
+	unsigned long old_addr = old_addr_start;
+	unsigned long new_addr = new_addr_start;
+	struct page *page;
+	pmd_t *pmd;
+	pte_t old_pte, *ptes;
+	bool pages_dirty = false;
+	int i = 0, j;
+	int ret;
+
+	ptes = kmalloc(sizeof(pte_t) * (len / PAGE_SIZE), GFP_KERNEL);
+	if (!ptes)
+		return -ENOMEM;
+	memset(ptes, 0, sizeof(pte_t) * (len / PAGE_SIZE));
+	lru_add_drain();
+	for (j = 0; j < len; j += PAGE_SIZE) {
+		page = pages[i];
+		ret = -EINVAL;
+		if (!page)
+			goto out_recover;
+		if (is_thp_or_huge(mm, new_addr))
+			goto out_recover;
+		old_vma = find_vma(mm, old_addr);
+		if (!old_vma || old_addr < old_vma->vm_start)
+			goto out_recover;
+		new_vma = find_vma(mm, new_addr);
+		if (!new_vma || new_addr < new_vma->vm_start)
+			goto out_recover;
+
+		ret = -EACCES;
+		if (pgprot_val(old_vma->vm_page_prot) !=
+		    pgprot_val(new_vma->vm_page_prot))
+			goto out_recover;
+
+		ret = -ENXIO;
+		pmd = mm_find_pmd(mm, old_addr);
+		if (!pmd)
+			goto out_recover;
+		uswap_unmap_anon_page(mm, old_vma, old_addr, page, pmd,
+				      &old_pte, true);
+		ptes[i] = old_pte;
+		if (pte_dirty(old_pte) || PageDirty(page))
+			pages_dirty = true;
+		put_page(page);
+
+		ret = vm_insert_anon_page(new_vma, new_addr, page);
+		if (ret)
+			goto out_recover;
+		get_page(page);
+
+		old_addr += PAGE_SIZE;
+		new_addr += PAGE_SIZE;
+		i++;
+	}
+
+	if (pages_dirty)
+		new_addr_start = new_addr_start | USWAP_PAGES_DIRTY;
+	kfree(ptes);
+	return new_addr_start;
+
+out_recover:
+	uswapout_recover(mm, old_addr_start, i, pages, new_addr_start, ptes);
+	kfree(ptes);
+	return ret;
+}
+
+
+/*
+ * When flags is MREMAP_USWAP_SET_PTE, uswap_mremap() is called in syscall
+ * mremap.
+ * Unmap the pages between 'addr ~ addr+old_len' and remap them to 'new_addr
+ * ~ new_addr+new_len'. Set the pte of old_addr to SWP_USERSWAP_ENTRY.
+ */
+unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len,
+			   unsigned long new_addr, unsigned long new_len)
+{
+	struct page **pages = NULL;
+	struct mm_struct *mm = current->mm;
+	unsigned long len = old_len;
+	unsigned long ret = -EINVAL;
+	int i;
+
+	if (!len || old_len != new_len || offset_in_page(old_addr) ||
+	    (len % PAGE_SIZE))
+		return ret;
+
+	down_read(&mm->mmap_lock);
+	ret = pages_can_be_swapped(mm, old_addr, len, &pages);
+	if (ret) {
+		up_read(&mm->mmap_lock);
+		return ret;
+	}
+
+	ret = do_user_swap(mm, old_addr, len, pages, new_addr);
+	up_read(&mm->mmap_lock);
+	/* follow_page() above increased the reference*/
+	for (i = 0; i < len / PAGE_SIZE; i++)
+		if (pages[i])
+			put_page(pages[i]);
+	kfree(pages);
+	return ret;
+}
+
 int mfill_atomic_pte_nocopy(struct mm_struct *mm,
			    pmd_t *dst_pmd,
			    struct vm_area_struct *dst_vma,
@@ -90,7 +409,8 @@ int mfill_atomic_pte_nocopy(struct mm_struct *mm,
		ret = -ENXIO;
		goto out_put_page;
	}
-	uswap_unmap_anon_page(mm, src_vma, src_addr, page, src_pmd, &src_pte);
+	uswap_unmap_anon_page(mm, src_vma, src_addr, page, src_pmd, &src_pte,
+			      false);
 
	if (dst_vma->vm_flags & VM_USWAP)
		ClearPageDirty(page);
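
Usage note (editor's illustration, not part of the patch): MREMAP_USWAP_SET_PTE has to be passed on its own, old_len must equal new_len, and the kernel always consumes the fifth mremap argument as the destination address, so a user-space swap agent should issue the raw syscall rather than glibc's mremap(), which only forwards new_address when MREMAP_FIXED is set. The sketch below shows only the calling convention; it assumes the source region has already been prepared for userswap (userfaultfd missing-mode registration with the userswap extension and enable_userswap turned on), which this patch does not cover, so without that setup the call fails with -EINVAL or -EAGAIN. Constants are taken from the hunks above; everything else is hypothetical example code.

/* uswap_out.c - illustrative sketch only */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MREMAP_USWAP_SET_PTE
#define MREMAP_USWAP_SET_PTE	64	/* from include/uapi/linux/mman.h above */
#endif
#define USWAP_PAGES_DIRTY	1	/* bit 0 of the address mremap returns */

int main(void)
{
	size_t len = 4 * (size_t)sysconf(_SC_PAGESIZE);

	/*
	 * 'src' stands in for a region the swap agent has already prepared
	 * for userswap (anonymous, userfaultfd-registered, VM_USWAP set);
	 * that preparation is outside this patch and not shown here.
	 */
	char *src = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	/* Destination window the physical pages get remapped into. */
	char *dst = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (src == MAP_FAILED || dst == MAP_FAILED)
		return 1;
	memset(src, 0x5a, len);			/* fault the pages in */

	/*
	 * glibc's mremap() only forwards the fifth argument when
	 * MREMAP_FIXED is set, and MREMAP_USWAP_SET_PTE must be the only
	 * flag, so invoke the syscall directly.
	 */
	long ret = syscall(SYS_mremap, src, len, len,
			   MREMAP_USWAP_SET_PTE, dst);
	if (ret == -1) {
		perror("mremap(MREMAP_USWAP_SET_PTE)");
		return 1;
	}

	int dirty = ret & USWAP_PAGES_DIRTY;
	char *out = (char *)(ret & ~(unsigned long)USWAP_PAGES_DIRTY);

	/* src's PTEs are now SWP_USERSWAP_ENTRY; the data lives at 'out'. */
	printf("swapped out %zu bytes to %p (%s)\n", len, (void *)out,
	       dirty ? "dirty" : "clean");
	return 0;
}

After the call the agent would write the contents out from the destination window to its own backing store and later service the userfaultfd fault on 'src' (e.g. via UFFDIO_COPY with the direct-map mode handled by mfill_atomic_pte_nocopy above).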