提交 f0f2e730 编写于 作者: K Kemeng Shi 提交者: Zheng Zengkai

etmem: x86: support scan hugetlb of vm

euleros inclusion
category: feature
feature: etmem
bugzilla: https://gitee.com/openeuler/kernel/issues/I4OODH?from=project-issue
CVE: NA

-------------------------------------------------

1. add hugetlb_entry callback to report hugetlb page.
2. try to walk host page table when ept entry is not present.
3. add SCAN_AS_HUGE to report ept page in pmd level as host
hugetlb page may be splited into 4k ept page in vm.
4. add SCAN_IGN_HOST for user to ignore access from host.
Signed-off-by: NKemeng Shi <shikemeng@huawei.com>
Reviewed-by: Nlouhongxiang <louhongxiang@huawei.com>
Reviewed-by: NChen Wandun <chenwandun@huawei.com>
Signed-off-by: NZheng Zengkai <zhengzengkai@huawei.com>
上级 b861c9ed
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include <asm/stage2_pgtable.h> #include <asm/stage2_pgtable.h>
#endif #endif
#include "etmem_scan.h" #include "etmem_scan.h"
#include <linux/hugetlb_inline.h>
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
/* /*
...@@ -289,8 +290,32 @@ static int page_idle_copy_user(struct page_idle_ctrl *pic, ...@@ -289,8 +290,32 @@ static int page_idle_copy_user(struct page_idle_ctrl *pic,
} }
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
static int vm_walk_host_range(unsigned long long start,
unsigned long end,
struct mm_walk *walk)
{
int ret;
struct page_idle_ctrl *pic = walk->private;
unsigned long tmp_gpa_to_hva = pic->gpa_to_hva;
pic->gpa_to_hva = 0;
local_irq_enable();
down_read(&walk->mm->mmap_lock);
local_irq_disable();
ret = walk_page_range(walk->mm, start + tmp_gpa_to_hva, end + tmp_gpa_to_hva,
walk->ops, walk->private);
up_read(&walk->mm->mmap_lock);
pic->gpa_to_hva = tmp_gpa_to_hva;
if (pic->flags & VM_SCAN_HOST) {
pic->restart_gpa -= tmp_gpa_to_hva;
pic->flags &= ~VM_SCAN_HOST;
}
return ret;
}
static int ept_pte_range(struct page_idle_ctrl *pic, static int ept_pte_range(struct page_idle_ctrl *pic,
pmd_t *pmd, unsigned long addr, unsigned long end) pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{ {
pte_t *pte; pte_t *pte;
enum ProcIdlePageType page_type; enum ProcIdlePageType page_type;
...@@ -300,9 +325,10 @@ static int ept_pte_range(struct page_idle_ctrl *pic, ...@@ -300,9 +325,10 @@ static int ept_pte_range(struct page_idle_ctrl *pic,
do { do {
if (KVM_CHECK_INVALID_SPTE(pte->pte)) { if (KVM_CHECK_INVALID_SPTE(pte->pte)) {
page_type = PTE_IDLE; page_type = PTE_IDLE;
} else if (!ept_pte_present(*pte)) } else if (!ept_pte_present(*pte)) {
page_type = PTE_HOLE; err = vm_walk_host_range(addr, end, walk);
else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, goto next;
} else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED,
(unsigned long *) &pte->pte)) (unsigned long *) &pte->pte))
page_type = PTE_IDLE; page_type = PTE_IDLE;
else { else {
...@@ -315,6 +341,7 @@ static int ept_pte_range(struct page_idle_ctrl *pic, ...@@ -315,6 +341,7 @@ static int ept_pte_range(struct page_idle_ctrl *pic,
} }
err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type);
next:
if (err) if (err)
break; break;
} while (pte++, addr += PAGE_SIZE, addr != end); } while (pte++, addr += PAGE_SIZE, addr != end);
...@@ -322,9 +349,30 @@ static int ept_pte_range(struct page_idle_ctrl *pic, ...@@ -322,9 +349,30 @@ static int ept_pte_range(struct page_idle_ctrl *pic,
return err; return err;
} }
static enum ProcIdlePageType ept_huge_accessed(pmd_t *pmd, unsigned long addr,
unsigned long end)
{
int accessed = PMD_IDLE;
pte_t *pte;
pte = pte_offset_kernel(pmd, addr);
do {
if (!KVM_CHECK_INVALID_SPTE(pte->pte))
continue;
if (!ept_pte_present(*pte))
continue;
if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED,
(unsigned long *)&pte->pte))
continue;
accessed = PMD_ACCESSED;
} while (pte++, addr += PAGE_SIZE, addr != end);
return accessed;
}
static int ept_pmd_range(struct page_idle_ctrl *pic, static int ept_pmd_range(struct page_idle_ctrl *pic,
pud_t *pud, unsigned long addr, unsigned long end) pud_t *pud, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{ {
pmd_t *pmd; pmd_t *pmd;
unsigned long next; unsigned long next;
...@@ -342,11 +390,15 @@ static int ept_pmd_range(struct page_idle_ctrl *pic, ...@@ -342,11 +390,15 @@ static int ept_pmd_range(struct page_idle_ctrl *pic,
next = pmd_addr_end(addr, end); next = pmd_addr_end(addr, end);
if (KVM_CHECK_INVALID_SPTE(pmd->pmd)) if (KVM_CHECK_INVALID_SPTE(pmd->pmd))
page_type = PMD_IDLE; page_type = PMD_IDLE;
else if (!ept_pmd_present(*pmd)) else if (!ept_pmd_present(*pmd)) {
page_type = PMD_HOLE; /* likely won't hit here */ err = vm_walk_host_range(addr, next, walk);
else if (!pmd_large(*pmd)) goto next;
page_type = pte_page_type; } else if (!pmd_large(*pmd)) {
else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, if (pic->flags & SCAN_AS_HUGE)
page_type = ept_huge_accessed(pmd, addr, next);
else
page_type = pte_page_type;
} else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED,
(unsigned long *)pmd)) (unsigned long *)pmd))
page_type = PMD_IDLE; page_type = PMD_IDLE;
else { else {
...@@ -360,7 +412,9 @@ static int ept_pmd_range(struct page_idle_ctrl *pic, ...@@ -360,7 +412,9 @@ static int ept_pmd_range(struct page_idle_ctrl *pic,
if (page_type != IDLE_PAGE_TYPE_MAX) if (page_type != IDLE_PAGE_TYPE_MAX)
err = pic_add_page(pic, addr, next, page_type); err = pic_add_page(pic, addr, next, page_type);
else else
err = ept_pte_range(pic, pmd, addr, next); err = ept_pte_range(pic, pmd, addr, next, walk);
next:
if (err) if (err)
break; break;
} while (pmd++, addr = next, addr != end); } while (pmd++, addr = next, addr != end);
...@@ -370,7 +424,8 @@ static int ept_pmd_range(struct page_idle_ctrl *pic, ...@@ -370,7 +424,8 @@ static int ept_pmd_range(struct page_idle_ctrl *pic,
static int ept_pud_range(struct page_idle_ctrl *pic, static int ept_pud_range(struct page_idle_ctrl *pic,
p4d_t *p4d, unsigned long addr, unsigned long end) p4d_t *p4d, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{ {
pud_t *pud; pud_t *pud;
unsigned long next; unsigned long next;
...@@ -381,15 +436,16 @@ static int ept_pud_range(struct page_idle_ctrl *pic, ...@@ -381,15 +436,16 @@ static int ept_pud_range(struct page_idle_ctrl *pic,
next = pud_addr_end(addr, end); next = pud_addr_end(addr, end);
if (!ept_pud_present(*pud)) { if (!ept_pud_present(*pud)) {
set_restart_gpa(next, "PUD_HOLE"); err = vm_walk_host_range(addr, next, walk);
continue; goto next;
} }
if (pud_large(*pud)) if (pud_large(*pud))
err = pic_add_page(pic, addr, next, PUD_PRESENT); err = pic_add_page(pic, addr, next, PUD_PRESENT);
else else
err = ept_pmd_range(pic, pud, addr, next); err = ept_pmd_range(pic, pud, addr, next, walk);
next:
if (err) if (err)
break; break;
} while (pud++, addr = next, addr != end); } while (pud++, addr = next, addr != end);
...@@ -398,7 +454,8 @@ static int ept_pud_range(struct page_idle_ctrl *pic, ...@@ -398,7 +454,8 @@ static int ept_pud_range(struct page_idle_ctrl *pic,
} }
static int ept_p4d_range(struct page_idle_ctrl *pic, static int ept_p4d_range(struct page_idle_ctrl *pic,
pgd_t *pgd, unsigned long addr, unsigned long end) pgd_t *pgd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{ {
p4d_t *p4d; p4d_t *p4d;
unsigned long next; unsigned long next;
...@@ -412,7 +469,7 @@ static int ept_p4d_range(struct page_idle_ctrl *pic, ...@@ -412,7 +469,7 @@ static int ept_p4d_range(struct page_idle_ctrl *pic,
continue; continue;
} }
err = ept_pud_range(pic, p4d, addr, next); err = ept_pud_range(pic, p4d, addr, next, walk);
if (err) if (err)
break; break;
} while (p4d++, addr = next, addr != end); } while (p4d++, addr = next, addr != end);
...@@ -420,10 +477,10 @@ static int ept_p4d_range(struct page_idle_ctrl *pic, ...@@ -420,10 +477,10 @@ static int ept_p4d_range(struct page_idle_ctrl *pic,
return err; return err;
} }
static int ept_page_range(struct page_idle_ctrl *pic, static int ept_page_range(struct page_idle_ctrl *pic,
unsigned long addr, unsigned long addr,
unsigned long end) unsigned long end,
struct mm_walk *walk)
{ {
struct kvm_vcpu *vcpu; struct kvm_vcpu *vcpu;
struct kvm_mmu *mmu; struct kvm_mmu *mmu;
...@@ -460,7 +517,7 @@ static int ept_page_range(struct page_idle_ctrl *pic, ...@@ -460,7 +517,7 @@ static int ept_page_range(struct page_idle_ctrl *pic,
continue; continue;
} }
err = ept_p4d_range(pic, pgd, addr, next); err = ept_p4d_range(pic, pgd, addr, next, walk);
if (err) if (err)
break; break;
} while (pgd++, addr = next, addr != end); } while (pgd++, addr = next, addr != end);
...@@ -692,8 +749,44 @@ static unsigned long vm_idle_find_gpa(struct page_idle_ctrl *pic, ...@@ -692,8 +749,44 @@ static unsigned long vm_idle_find_gpa(struct page_idle_ctrl *pic,
return INVALID_PAGE; return INVALID_PAGE;
} }
static int mm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long next,
struct mm_walk *walk);
static int vm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long next,
struct mm_walk *walk)
{
struct page_idle_ctrl *pic = walk->private;
pic->flags |= VM_SCAN_HOST;
return mm_idle_hugetlb_entry(pte, hmask, addr, next, walk);
}
static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk);
static int vm_idle_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct page_idle_ctrl *pic = walk->private;
pic->flags |= VM_SCAN_HOST;
return mm_idle_pmd_entry(pmd, addr, next, walk);
}
static int mm_idle_pud_entry(pud_t *pud, unsigned long addr,
unsigned long next, struct mm_walk *walk);
static int vm_idle_pud_entry(pud_t *pud, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct page_idle_ctrl *pic = walk->private;
pic->flags |= VM_SCAN_HOST;
return mm_idle_pud_entry(pud, addr, next, walk);
}
static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic,
unsigned long start, unsigned long end) unsigned long start, unsigned long end,
struct mm_walk *walk)
{ {
unsigned long gpa_addr; unsigned long gpa_addr;
unsigned long addr_range; unsigned long addr_range;
...@@ -728,7 +821,7 @@ static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, ...@@ -728,7 +821,7 @@ static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic,
#ifdef CONFIG_ARM64 #ifdef CONFIG_ARM64
arm_page_range(pic, gpa_addr, gpa_addr + addr_range); arm_page_range(pic, gpa_addr, gpa_addr + addr_range);
#else #else
ept_page_range(pic, gpa_addr, gpa_addr + addr_range); ept_page_range(pic, gpa_addr, gpa_addr + addr_range, walk);
#endif #endif
va_end = pic->gpa_to_hva + gpa_addr + addr_range; va_end = pic->gpa_to_hva + gpa_addr + addr_range;
} }
...@@ -744,10 +837,14 @@ static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, ...@@ -744,10 +837,14 @@ static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic,
return ret; return ret;
} }
static int mm_idle_test_walk(unsigned long start, unsigned long end,
struct mm_walk *walk);
static ssize_t vm_idle_read(struct file *file, char *buf, static ssize_t vm_idle_read(struct file *file, char *buf,
size_t count, loff_t *ppos) size_t count, loff_t *ppos)
{ {
struct mm_struct *mm = file->private_data; struct mm_struct *mm = file->private_data;
struct mm_walk mm_walk = {};
struct mm_walk_ops mm_walk_ops = {};
struct page_idle_ctrl *pic; struct page_idle_ctrl *pic;
unsigned long hva_start = *ppos; unsigned long hva_start = *ppos;
unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT));
...@@ -760,7 +857,16 @@ static ssize_t vm_idle_read(struct file *file, char *buf, ...@@ -760,7 +857,16 @@ static ssize_t vm_idle_read(struct file *file, char *buf,
setup_page_idle_ctrl(pic, buf, count, file->f_flags); setup_page_idle_ctrl(pic, buf, count, file->f_flags);
pic->kvm = mm_kvm(mm); pic->kvm = mm_kvm(mm);
ret = vm_idle_walk_hva_range(pic, hva_start, hva_end); mm_walk_ops.pmd_entry = vm_idle_pmd_entry;
mm_walk_ops.pud_entry = vm_idle_pud_entry;
mm_walk_ops.hugetlb_entry = vm_idle_hugetlb_entry;
mm_walk_ops.test_walk = mm_idle_test_walk;
mm_walk.mm = mm;
mm_walk.ops = &mm_walk_ops;
mm_walk.private = pic;
ret = vm_idle_walk_hva_range(pic, hva_start, hva_end, &mm_walk);
if (ret) if (ret)
goto out_kvm; goto out_kvm;
...@@ -863,6 +969,8 @@ static int mm_idle_pte_range(struct page_idle_ctrl *pic, pmd_t *pmd, ...@@ -863,6 +969,8 @@ static int mm_idle_pte_range(struct page_idle_ctrl *pic, pmd_t *pmd,
do { do {
if (!pte_present(*pte)) if (!pte_present(*pte))
page_type = PTE_HOLE; page_type = PTE_HOLE;
else if (pic->flags & SCAN_IGN_HOST)
page_type = PTE_IDLE;
else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED,
(unsigned long *) &pte->pte)) (unsigned long *) &pte->pte))
page_type = PTE_IDLE; page_type = PTE_IDLE;
...@@ -878,6 +986,39 @@ static int mm_idle_pte_range(struct page_idle_ctrl *pic, pmd_t *pmd, ...@@ -878,6 +986,39 @@ static int mm_idle_pte_range(struct page_idle_ctrl *pic, pmd_t *pmd,
return err; return err;
} }
static inline unsigned long mask_to_size(unsigned long mask)
{
return ~mask + 1;
}
static int mm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long next,
struct mm_walk *walk)
{
struct page_idle_ctrl *pic = walk->private;
enum ProcIdlePageType page_type;
unsigned long start = addr & hmask; /* hugepage may be splited in vm */
int ret;
if (mask_to_size(hmask) == PUD_SIZE) {
page_type = PUD_PRESENT;
goto add_page;
}
if (!pte_present(*pte))
page_type = PMD_HOLE;
else if (pic->flags & SCAN_IGN_HOST)
page_type = PMD_IDLE;
else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, (unsigned long *)pte))
page_type = PMD_IDLE;
else
page_type = PMD_ACCESSED;
add_page:
ret = pic_add_page(pic, start, start + pagetype_size[page_type], page_type);
return ret;
}
static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk) unsigned long next, struct mm_walk *walk)
{ {
...@@ -907,7 +1048,8 @@ static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, ...@@ -907,7 +1048,8 @@ static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr,
else if (!mm_idle_pmd_large(*pmd)) else if (!mm_idle_pmd_large(*pmd))
page_type = pte_page_type; page_type = pte_page_type;
else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED,
(unsigned long *)pmd)) (unsigned long *)pmd) ||
pic->flags & SCAN_IGN_HOST)
page_type = PMD_IDLE; page_type = PMD_IDLE;
else else
page_type = PMD_ACCESSED; page_type = PMD_ACCESSED;
...@@ -945,6 +1087,8 @@ static int mm_idle_test_walk(unsigned long start, unsigned long end, ...@@ -945,6 +1087,8 @@ static int mm_idle_test_walk(unsigned long start, unsigned long end,
struct vm_area_struct *vma = walk->vma; struct vm_area_struct *vma = walk->vma;
if (vma->vm_file) { if (vma->vm_file) {
if (is_vm_hugetlb_page(vma))
return 0;
if ((vma->vm_flags & (VM_WRITE|VM_MAYSHARE)) == VM_WRITE) if ((vma->vm_flags & (VM_WRITE|VM_MAYSHARE)) == VM_WRITE)
return 0; return 0;
return 1; return 1;
...@@ -1038,6 +1182,7 @@ static ssize_t mm_idle_read(struct file *file, char *buf, ...@@ -1038,6 +1182,7 @@ static ssize_t mm_idle_read(struct file *file, char *buf,
mm_walk_ops->pmd_entry = mm_idle_pmd_entry; mm_walk_ops->pmd_entry = mm_idle_pmd_entry;
mm_walk_ops->pud_entry = mm_idle_pud_entry; mm_walk_ops->pud_entry = mm_idle_pud_entry;
mm_walk_ops->hugetlb_entry = mm_idle_hugetlb_entry;
mm_walk_ops->test_walk = mm_idle_test_walk; mm_walk_ops->test_walk = mm_idle_test_walk;
mm_walk.mm = mm; mm_walk.mm = mm;
...@@ -1057,6 +1202,29 @@ static ssize_t mm_idle_read(struct file *file, char *buf, ...@@ -1057,6 +1202,29 @@ static ssize_t mm_idle_read(struct file *file, char *buf,
return ret; return ret;
} }
static long page_scan_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
void __user *argp = (void __user *)arg;
unsigned int flags;
if (get_user(flags, (unsigned int __user *)argp))
return -EFAULT;
flags &= ALL_SCAN_FLAGS;
switch (cmd) {
case IDLE_SCAN_ADD_FLAGS:
filp->f_flags |= flags;
break;
case IDLE_SCAN_REMOVE_FLAGS:
filp->f_flags &= ~flags;
break;
default:
return -EOPNOTSUPP;
}
return 0;
}
extern struct file_operations proc_page_scan_operations; extern struct file_operations proc_page_scan_operations;
static int page_scan_entry(void) static int page_scan_entry(void)
...@@ -1065,6 +1233,7 @@ static int page_scan_entry(void) ...@@ -1065,6 +1233,7 @@ static int page_scan_entry(void)
proc_page_scan_operations.read = page_scan_read; proc_page_scan_operations.read = page_scan_read;
proc_page_scan_operations.open = page_scan_open; proc_page_scan_operations.open = page_scan_open;
proc_page_scan_operations.release = page_scan_release; proc_page_scan_operations.release = page_scan_release;
proc_page_scan_operations.unlocked_ioctl = page_scan_ioctl;
return 0; return 0;
} }
......
...@@ -2,10 +2,24 @@ ...@@ -2,10 +2,24 @@
#ifndef _PAGE_IDLE_H #ifndef _PAGE_IDLE_H
#define _PAGE_IDLE_H #define _PAGE_IDLE_H
#include <linux/types.h>
#define SCAN_HUGE_PAGE O_NONBLOCK /* only huge page */ #define SCAN_HUGE_PAGE O_NONBLOCK /* only huge page */
#define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */ #define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */
#define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */ #define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */
/* define to not used file flags */
#define SCAN_AS_HUGE 0100000000 /* treat normal page as hugepage in vm */
#define SCAN_IGN_HOST 0200000000 /* ignore host access when scan vm */
#define VM_SCAN_HOST 0400000000 /* scan and add host page for vm hole(internal) */
#define ALL_SCAN_FLAGS (SCAN_HUGE_PAGE | SCAN_SKIM_IDLE | SCAN_DIRTY_PAGE | \
SCAN_AS_HUGE | SCAN_IGN_HOST | VM_SCAN_HOST)
#define IDLE_SCAN_MAGIC 0x66
#define IDLE_SCAN_ADD_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x0, unsigned int)
#define IDLE_SCAN_REMOVE_FLAGS _IOW(IDLE_SCAN_MAGIC, 0x1, unsigned int)
enum ProcIdlePageType { enum ProcIdlePageType {
PTE_ACCESSED, /* 4k page */ PTE_ACCESSED, /* 4k page */
PMD_ACCESSED, /* 2M page */ PMD_ACCESSED, /* 2M page */
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册