Commit c97cdd7e authored by ZhangPeng, committed by Ma Wupeng

userswap: introduce MREMAP_USWAP_SET_PTE to remap for swapping out

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I6CAIM

--------------------------------

Introduce MREMAP_USWAP_SET_PTE to perform the remapping step of the
swap-out phase: the pages between 'addr ~ addr+old_len' are unmapped and
remapped to 'new_addr ~ new_addr+new_len', and during unmapping the PTE
of each old page is set to SWP_USERSWAP_ENTRY.
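
For illustration only (not part of this patch), the userspace swap-out
step could look roughly like the sketch below. It assumes the uapi value
MREMAP_USWAP_SET_PTE (64) added here, that the enable_userswap switch is
on, that 'old' is anonymous memory registered with userfaultfd in missing
mode, and that 'new_addr' already points to an anonymous mapping of at
least 'len' bytes; the raw syscall is used because the glibc mremap()
wrapper typically forwards a fifth argument only for
MREMAP_FIXED/MREMAP_DONTUNMAP.

/* Hedged sketch only; the helper name, error handling and printout are
 * illustrative and not part of the kernel patch. */
#include <errno.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MREMAP_USWAP_SET_PTE
#define MREMAP_USWAP_SET_PTE 64   /* uapi value added by this patch */
#endif
#define USWAP_PAGES_DIRTY 1       /* bit 0 of the returned address */

/* Swap out 'len' bytes at 'old'; 'new_addr' must already be an anonymous
 * mapping of at least 'len' bytes that receives the pages. */
static long uswap_swap_out(void *old, void *new_addr, size_t len)
{
        long ret = syscall(SYS_mremap, old, len, len,
                           MREMAP_USWAP_SET_PTE, new_addr);

        if (ret == -1)
                return -errno;
        if ((unsigned long)ret & USWAP_PAGES_DIRTY)
                printf("pages were dirty; write them to the swap backend\n");
        /* The PTEs of the old range are now SWP_USERSWAP_ENTRY; the next
         * access there raises a userfaultfd missing event. */
        return 0;
}
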
Signed-off-by: ZhangPeng <zhangpeng362@huawei.com>
Parent 444ec524
@@ -6,16 +6,28 @@
#ifndef _LINUX_USERSWAP_H
#define _LINUX_USERSWAP_H
#include <linux/mman.h>
#include <linux/userfaultfd.h>
#ifdef CONFIG_USERSWAP
extern int enable_userswap;
/*
* In uswap situation, we use the bit 0 of the returned address to indicate
* whether the pages are dirty.
*/
#define USWAP_PAGES_DIRTY 1
int mfill_atomic_pte_nocopy(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr);
unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len,
unsigned long new_addr, unsigned long new_len);
static inline bool uswap_check_copy_mode(struct vm_area_struct *vma, __u64 mode)
{
if (!(vma->vm_flags & VM_USWAP) && (mode & UFFDIO_COPY_MODE_DIRECT_MAP))
@@ -23,6 +35,18 @@ static inline bool uswap_check_copy_mode(struct vm_area_struct *vma, __u64 mode)
return true;
}
static inline bool uswap_validate_mremap_flags(unsigned long flags)
{
if (!enable_userswap && flags & MREMAP_USWAP_SET_PTE)
return false;
if (flags & MREMAP_USWAP_SET_PTE && flags & ~MREMAP_USWAP_SET_PTE)
return false;
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP |
MREMAP_USWAP_SET_PTE))
return false;
return true;
}
#endif /* CONFIG_USERSWAP */
#endif /* _LINUX_USERSWAP_H */
@@ -30,8 +30,6 @@
#define MAP_SYNC 0x080000 /* perform synchronous page faults for the mapping */
#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
#define MAP_REPLACE 0x1000000
#define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be
* uninitialized */
@@ -8,6 +8,7 @@
#define MREMAP_MAYMOVE 1
#define MREMAP_FIXED 2
#define MREMAP_DONTUNMAP 4
#define MREMAP_USWAP_SET_PTE 64
#define OVERCOMMIT_GUESS 0
#define OVERCOMMIT_ALWAYS 1
@@ -49,7 +49,6 @@
#include <linux/sched/mm.h>
#include <linux/swapops.h>
#include <linux/share_pool.h>
#include <linux/userswap.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -1623,205 +1622,6 @@ __do_mmap(struct file *file, unsigned long addr, unsigned long len,
{
return __do_mmap_mm(current->mm, file, addr, len, prot, flags, vm_flags, pgoff, populate, uf);
}
#ifdef CONFIG_USERSWAP
/*
* Check if pages between 'addr ~ addr+len' can be user swapped. If so, get
* the reference of the pages and return the pages through input parameters
* 'ppages'.
*/
static int pages_can_be_swapped(struct mm_struct *mm, unsigned long addr,
unsigned long len, struct page ***ppages)
{
struct vm_area_struct *vma;
struct page *page = NULL;
struct page **pages = NULL;
unsigned long addr_end = addr + len;
unsigned long ret;
int i, page_num = 0;
pages = kmalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL);
if (!pages)
return -ENOMEM;
while (addr < addr_end) {
vma = find_vma(mm, addr);
if (!vma || !vma_is_anonymous(vma) || vma->vm_file ||
(vma->vm_flags & VM_LOCKED) || (vma->vm_flags & VM_STACK) ||
(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
ret = -EINVAL;
goto out;
}
if (!(vma->vm_flags & VM_UFFD_MISSING)) {
ret = -EAGAIN;
goto out;
}
get_again:
/* follow_page will inc page ref, dec the ref after we remap the page */
page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page)) {
ret = -ENODEV;
goto out;
}
pages[page_num++] = page;
if (!PageAnon(page) || !PageSwapBacked(page) ||
PageHuge(page) || PageSwapCache(page)) {
ret = -EINVAL;
goto out;
} else if (PageTransCompound(page)) {
if (trylock_page(page)) {
if (!split_huge_page(page)) {
put_page(page);
page_num--;
unlock_page(page);
goto get_again;
} else {
unlock_page(page);
ret = -EINVAL;
goto out;
}
} else {
ret = -EINVAL;
goto out;
}
}
if (page_mapcount(page) > 1 ||
page_mapcount(page) + 1 != page_count(page)) {
ret = -EBUSY;
goto out;
}
addr += PAGE_SIZE;
}
*ppages = pages;
return 0;
out:
for (i = 0; i < page_num; i++)
put_page(pages[i]);
if (pages)
kfree(pages);
*ppages = NULL;
return ret;
}
/*
* In uswap situation, we use the bit 0 of the returned address to indicate
* whether the pages are dirty.
*/
#define USWAP_PAGES_DIRTY 1
/* unmap the pages between 'addr ~ addr+len' and remap them to a new address */
static unsigned long
do_user_swap(struct mm_struct *mm, unsigned long addr_start, unsigned long len,
struct page **pages, unsigned long new_addr)
{
struct vm_area_struct *vma;
struct page *page;
struct mmu_notifier_range range;
pmd_t *pmd;
pte_t *pte, old_pte;
spinlock_t *ptl;
unsigned long addr;
bool pages_dirty = false;
int i = 0;
addr = addr_start;
lru_add_drain();
i = 0;
while (addr < addr_start + len) {
page = pages[i];
vma = find_vma(mm, addr);
if (!vma)
return -EINVAL;
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma,
vma->vm_mm, addr, addr + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
pmd = mm_find_pmd(mm, addr);
if (!pmd) {
mmu_notifier_invalidate_range_end(&range);
return -ENXIO;
}
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
flush_cache_page(vma, addr, pte_pfn(*pte));
old_pte = ptep_clear_flush(vma, addr, pte);
if (pte_dirty(old_pte) || PageDirty(page))
pages_dirty = true;
set_pte(pte, swp_entry_to_pte(swp_entry(SWP_USERSWAP_ENTRY,
page_to_pfn(page))));
dec_mm_counter(mm, MM_ANONPAGES);
reliable_page_counter(page, mm, -1);
page_remove_rmap(page, false);
put_page(page);
pte_unmap_unlock(pte, ptl);
mmu_notifier_invalidate_range_end(&range);
vma->vm_flags |= VM_USWAP;
page->mapping = NULL;
addr += PAGE_SIZE;
i++;
}
addr = new_addr;
vma = find_vma(mm, addr);
i = 0;
while (addr < new_addr + len) {
if (addr > vma->vm_end - 1)
vma = find_vma(mm, addr);
if (!vma)
return -ENODEV;
page = pages[i++];
if (vm_insert_page(vma, addr, page))
return -EFAULT;
addr += PAGE_SIZE;
}
vma->vm_flags |= VM_USWAP;
if (pages_dirty)
new_addr = new_addr | USWAP_PAGES_DIRTY;
return new_addr;
}
static inline unsigned long
do_uswap_mmap(struct file *file, unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags, unsigned long pgoff,
unsigned long *populate, struct list_head *uf)
{
struct mm_struct *mm = current->mm;
unsigned long old_addr = addr;
struct page **pages = NULL;
unsigned long ret;
int i;
if (!len || offset_in_page(addr) || (len % PAGE_SIZE))
return -EINVAL;
ret = pages_can_be_swapped(mm, addr, len, &pages);
if (ret)
return ret;
/* mark the vma as special to avoid merging with other vmas */
addr = __do_mmap(file, addr, len, prot, flags, VM_SPECIAL, pgoff,
populate, uf);
if (IS_ERR_VALUE(addr)) {
ret = addr;
goto out;
}
ret = do_user_swap(mm, old_addr, len, pages, addr);
out:
/* follow_page() above increased the reference*/
for (i = 0; i < len / PAGE_SIZE; i++)
put_page(pages[i]);
if (pages)
kfree(pages);
return ret;
}
#endif
/*
* The caller must write-lock current->mm->mmap_lock.
@@ -1831,11 +1631,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long flags, unsigned long pgoff,
unsigned long *populate, struct list_head *uf)
{
#ifdef CONFIG_USERSWAP
if (enable_userswap && (flags & MAP_REPLACE))
return do_uswap_mmap(file, addr, len, prot, flags, pgoff,
populate, uf);
#endif
return __do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf);
}
@@ -25,6 +25,7 @@
#include <linux/mm-arch-hooks.h>
#include <linux/userfaultfd_k.h>
#include <linux/share_pool.h>
#include <linux/userswap.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
@@ -915,8 +916,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
*/
addr = untagged_addr(addr);
#ifdef CONFIG_USERSWAP
if (!uswap_validate_mremap_flags(flags))
return ret;
#else
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
return ret;
#endif
if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
return ret;
@@ -947,6 +953,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (!new_len)
return ret;
#ifdef CONFIG_USERSWAP
if (flags & MREMAP_USWAP_SET_PTE)
return uswap_mremap(addr, old_len, new_addr, new_len);
#endif
if (mmap_write_lock_killable(current->mm))
return -EINTR;
@@ -9,15 +9,155 @@
#include <linux/swapops.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/userswap.h>
#include "internal.h"
int enable_userswap;
static bool vma_uswap_compatible(struct vm_area_struct *vma)
{
if (!vma || !(vma->vm_flags & VM_USWAP) || !vma_is_anonymous(vma) ||
vma->vm_file || vma->vm_flags & (VM_SHARED | VM_LOCKED | VM_STACK |
VM_IO | VM_PFNMAP))
return false;
return true;
}
static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pgd = pgd_offset(mm, addr);
if (pgd_none_or_clear_bad(pgd))
return NULL;
p4d = p4d_offset(pgd, addr);
if (p4d_none_or_clear_bad(p4d))
return NULL;
pud = pud_offset(p4d, addr);
if (pud_none_or_clear_bad(pud))
return NULL;
return pud;
}
static bool is_thp_or_huge(struct mm_struct *mm, unsigned long addr)
{
pud_t *pud;
pmd_t *pmd;
pud = get_old_pud(mm, addr);
if (!pud)
return false;
else if (pud_huge(*pud))
return true;
pmd = pmd_offset(pud, addr);
if (!pmd)
return false;
else if (pmd_huge(*pmd) || pmd_trans_huge(*pmd))
return true;
return false;
}
/*
* Check if pages between 'addr ~ addr+len' can be user swapped. If so, get
* the reference of the pages and return the pages through input parameters
* 'ppages'.
*/
static unsigned long pages_can_be_swapped(struct mm_struct *mm,
unsigned long addr,
unsigned long len,
struct page ***ppages)
{
struct vm_area_struct *vma;
struct page *page = NULL;
struct page **pages = NULL;
unsigned long addr_end = addr + len;
unsigned long ret;
int i, page_num = 0;
*ppages = NULL;
pages = kmalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL);
if (!pages)
return -ENOMEM;
while (addr < addr_end) {
vma = find_vma(mm, addr);
if (!vma || addr < vma->vm_start ||
!vma_uswap_compatible(vma)) {
ret = -EINVAL;
goto out_err;
}
if (!(vma->vm_flags & VM_UFFD_MISSING)) {
ret = -EAGAIN;
goto out_err;
}
get_again:
/*
* follow_page will inc page ref, dec the ref after we remap
* the page.
*/
page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page)) {
ret = -ENODEV;
goto out_err;
}
pages[page_num++] = page;
if (!PageAnon(page) || !PageSwapBacked(page) ||
PageHuge(page) || PageSwapCache(page)) {
ret = -EINVAL;
goto out_err;
}
if (PageTransCompound(page)) {
if (trylock_page(page)) {
if (!split_huge_page(page)) {
unlock_page(page);
put_page(page);
page_num--;
goto get_again;
} else
unlock_page(page);
}
ret = -EINVAL;
goto out_err;
}
/*
* Check that no O_DIRECT or similar I/O is in progress on the
* page
*/
if (page_mapcount(page) > 1) {
ret = -EBUSY;
goto out_err;
}
addr += PAGE_SIZE;
}
*ppages = pages;
return 0;
out_err:
for (i = 0; i < page_num; i++)
put_page(pages[i]);
kfree(pages);
return ret;
}
static void uswap_unmap_anon_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long addr, struct page *page,
pmd_t *pmd, pte_t *old_pte)
pmd_t *pmd, pte_t *old_pte,
bool set_to_swp)
{
struct mmu_notifier_range range;
spinlock_t *ptl;
@@ -31,6 +171,9 @@ static void uswap_unmap_anon_page(struct mm_struct *mm,
goto out_release_unlock;
flush_cache_page(vma, addr, pte_pfn(*pte));
*old_pte = ptep_clear_flush(vma, addr, pte);
if (set_to_swp)
set_pte_at(mm, addr, pte, swp_entry_to_pte(swp_entry(
SWP_USERSWAP_ENTRY, page_to_pfn(page))));
dec_mm_counter(mm, MM_ANONPAGES);
reliable_page_counter(page, mm, -1);
@@ -61,6 +204,182 @@ static void uswap_map_anon_page(struct mm_struct *mm,
pte_unmap_unlock(pte, ptl);
}
static unsigned long vm_insert_anon_page(struct vm_area_struct *vma,
unsigned long addr, struct page *page)
{
struct mm_struct *mm = vma->vm_mm;
int ret = 0;
pte_t *pte;
spinlock_t *ptl;
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
flush_dcache_page(page);
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
return -ENOMEM;
if (!pte_none(*pte)) {
ret = -EBUSY;
goto out_unlock;
}
inc_mm_counter(mm, MM_ANONPAGES);
reliable_page_counter(page, mm, 1);
page_add_new_anon_rmap(page, vma, addr, false);
set_pte_at(mm, addr, pte, mk_pte(page, vma->vm_page_prot));
out_unlock:
pte_unmap_unlock(pte, ptl);
return ret;
}
static void uswapout_recover(struct mm_struct *mm,
unsigned long old_addr_start, unsigned long len,
struct page **pages, unsigned long new_addr_start,
pte_t *ptes)
{
unsigned long unmap_old_addr = old_addr_start;
unsigned long unmap_new_addr = new_addr_start;
struct page *page;
pmd_t *old_pmd, *new_pmd;
pte_t pte;
int i;
for (i = 0; i < len; i++) {
page = pages[i];
pte = ptes[i];
new_pmd = mm_find_pmd(mm, new_addr_start);
old_pmd = mm_find_pmd(mm, unmap_old_addr);
uswap_unmap_anon_page(mm, find_vma(mm, unmap_new_addr),
unmap_new_addr, page, new_pmd, NULL,
false);
uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr),
unmap_old_addr, page, old_pmd, pte);
unmap_old_addr += PAGE_SIZE;
unmap_new_addr += PAGE_SIZE;
}
if (pte_val(ptes[len]) != 0) {
page = pages[len];
pte = ptes[len];
old_pmd = mm_find_pmd(mm, unmap_old_addr);
uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr),
unmap_old_addr, page, old_pmd, pte);
get_page(page);
}
}
/* unmap the pages between 'addr ~ addr+len' and remap them to a new address */
static unsigned long do_user_swap(struct mm_struct *mm,
unsigned long old_addr_start,
unsigned long len, struct page **pages,
unsigned long new_addr_start)
{
struct vm_area_struct *old_vma, *new_vma;
unsigned long old_addr = old_addr_start;
unsigned long new_addr = new_addr_start;
struct page *page;
pmd_t *pmd;
pte_t old_pte, *ptes;
bool pages_dirty = false;
int i = 0, j;
int ret;
ptes = kmalloc(sizeof(pte_t) * (len / PAGE_SIZE), GFP_KERNEL);
if (!ptes)
return -ENOMEM;
memset(ptes, 0, sizeof(pte_t) * (len / PAGE_SIZE));
lru_add_drain();
for (j = 0; j < len; j += PAGE_SIZE) {
page = pages[i];
ret = -EINVAL;
if (!page)
goto out_recover;
if (is_thp_or_huge(mm, new_addr))
goto out_recover;
old_vma = find_vma(mm, old_addr);
if (!old_vma || old_addr < old_vma->vm_start)
goto out_recover;
new_vma = find_vma(mm, new_addr);
if (!new_vma || new_addr < new_vma->vm_start)
goto out_recover;
ret = -EACCES;
if (pgprot_val(old_vma->vm_page_prot) !=
pgprot_val(new_vma->vm_page_prot))
goto out_recover;
ret = -ENXIO;
pmd = mm_find_pmd(mm, old_addr);
if (!pmd)
goto out_recover;
uswap_unmap_anon_page(mm, old_vma, old_addr, page, pmd,
&old_pte, true);
ptes[i] = old_pte;
if (pte_dirty(old_pte) || PageDirty(page))
pages_dirty = true;
put_page(page);
ret = vm_insert_anon_page(new_vma, new_addr, page);
if (ret)
goto out_recover;
get_page(page);
old_addr += PAGE_SIZE;
new_addr += PAGE_SIZE;
i++;
}
if (pages_dirty)
new_addr = new_addr | USWAP_PAGES_DIRTY;
kfree(ptes);
return new_addr_start;
out_recover:
uswapout_recover(mm, old_addr_start, i, pages, new_addr_start, ptes);
kfree(ptes);
return ret;
}
/*
* When flags is MREMAP_USWAP_SET_PTE, uswap_mremap() is called in syscall
* mremap.
* Unmap the pages between 'addr ~addr+old_len' and remap them to 'new_addr
* ~ new_addr+new_len'. Set the pte of old_addr to SWP_USERSWAP_ENTRY.
*/
unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len,
unsigned long new_addr, unsigned long new_len)
{
struct page **pages = NULL;
struct mm_struct *mm = current->mm;
unsigned long len = old_len;
unsigned long ret = -EINVAL;
int i;
if (!len || old_len != new_len || offset_in_page(old_addr) ||
(len % PAGE_SIZE))
return ret;
down_read(&mm->mmap_lock);
ret = pages_can_be_swapped(mm, old_addr, len, &pages);
if (ret) {
up_read(&mm->mmap_lock);
return ret;
}
ret = do_user_swap(mm, old_addr, len, pages, new_addr);
up_read(&mm->mmap_lock);
/* follow_page() above increased the reference*/
for (i = 0; i < len / PAGE_SIZE; i++)
if (pages[i])
put_page(pages[i]);
kfree(pages);
return ret;
}
int mfill_atomic_pte_nocopy(struct mm_struct *mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
@@ -90,7 +409,8 @@ int mfill_atomic_pte_nocopy(struct mm_struct *mm,
ret = -ENXIO;
goto out_put_page;
}
uswap_unmap_anon_page(mm, src_vma, src_addr, page, src_pmd, &src_pte);
uswap_unmap_anon_page(mm, src_vma, src_addr, page, src_pmd, &src_pte,
false);
if (dst_vma->vm_flags & VM_USWAP)
ClearPageDirty(page);
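
For completeness, the swap-in side (reached through userfaultfd, as the
uswap_check_copy_mode() helper above suggests) could be driven roughly as
in the hedged sketch below. It assumes UFFDIO_COPY_MODE_DIRECT_MAP is the
openEuler-specific UFFDIO_COPY mode that ends up in
mfill_atomic_pte_nocopy(), mapping the caller-supplied page directly
instead of copying it; 'uffd', 'fault_addr', 'page_buf' and 'page_size'
are assumed to come from the surrounding fault-handling loop.

#include <linux/userfaultfd.h>
#include <stdio.h>
#include <sys/ioctl.h>

/* Hedged sketch only; not part of the kernel patch. */
static int uswap_swap_in(int uffd, unsigned long fault_addr,
                         void *page_buf, unsigned long page_size)
{
        struct uffdio_copy copy = {
                .dst  = fault_addr & ~(page_size - 1),
                .src  = (unsigned long)page_buf,     /* page-aligned buffer */
                .len  = page_size,
                .mode = UFFDIO_COPY_MODE_DIRECT_MAP, /* openEuler extension */
        };

        if (ioctl(uffd, UFFDIO_COPY, &copy) == -1) {
                perror("UFFDIO_COPY");
                return -1;
        }
        return 0;
}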