From aee7326bb297102b24bed3dfbb5ba51bca3527e9 Mon Sep 17 00:00:00 2001
From: yanxiaodan
Date: Mon, 10 Aug 2020 20:01:43 +0800
Subject: [PATCH] memory-scan initialization, derived from memory-optimizer
 which is located in https://github.com/intel/memory-optimizer/tree/master/kernel_module

---
 Kbuild                     |    2 +
 Makefile                   |   11 +
 README.en.md               |   17 +-
 README.md                  |    7 +-
 ept_idle.c                 | 1038 ++++++++++++++++++++++++++++++++++++
 ept_idle.h                 |  123 +++++
 ept_idle_common.h          |   33 ++
 ept_idle_native_pagewalk.c |  465 ++++++++++++++++
 ept_idle_native_pagewalk.h |    7 +
 tlb_flush.c                |  288 ++++++++++
 tlb_flush.h                |   10 +
 11 files changed, 1983 insertions(+), 18 deletions(-)
 create mode 100644 Kbuild
 create mode 100644 Makefile
 create mode 100644 ept_idle.c
 create mode 100644 ept_idle.h
 create mode 100644 ept_idle_common.h
 create mode 100644 ept_idle_native_pagewalk.c
 create mode 100644 ept_idle_native_pagewalk.h
 create mode 100644 tlb_flush.c
 create mode 100644 tlb_flush.h

diff --git a/Kbuild b/Kbuild
new file mode 100644
index 0000000..927fb51
--- /dev/null
+++ b/Kbuild
@@ -0,0 +1,2 @@
+obj-m := kvm_ept_idle.o
+kvm_ept_idle-y := ept_idle.o ept_idle_native_pagewalk.o tlb_flush.o
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0ba5bb8
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,11 @@
+include Kbuild
+
+KERNEL_SRC_DIR ?= /lib/modules/$(shell uname -r)/build
+MODULE_DIR ?= $(shell pwd)
+
+default:
+	$(MAKE) -C $(KERNEL_SRC_DIR) M=$(MODULE_DIR) modules
+
+clean:
+	rm -f *.o *.ko *.mod.c modules.order Module.symvers
+
diff --git a/README.en.md b/README.en.md
index 8be96a1..dfdb595 100644
--- a/README.en.md
+++ b/README.en.md
@@ -1,22 +1,13 @@
 # memory-scan
 
 #### Description
-A kernel module for scaning page table of process/VMs
-
-#### Software Architecture
-Software architecture description
+memory-scan is derived from memory-optimizer, which is located at https://github.com/intel/memory-optimizer/tree/master/kernel_module. memory-scan is a kernel module for scanning the page tables of processes/VMs.
 
 #### Installation
 
-1. xxxx
-2. xxxx
-3. xxxx
-
-#### Instructions
-
-1. xxxx
-2. xxxx
-3. xxxx
+openEuler users can build and load memory-scan as follows:
+1. make
+2. insmod kvm_ept_idle.ko
 
 #### Contribution
 
diff --git a/README.md b/README.md
index 5e04446..4a99cc7 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,8 @@
 # memory-scan
 
 #### 介绍
-A kernel module for scaning page table of process/VMs
-
-#### 软件架构
-软件架构说明
-
+memory-scan is derived from memory-optimizer, which is located at https://github.com/intel/memory-optimizer/tree/master/kernel
+_module. memory-scan is a kernel module for scanning the page tables of processes/VMs.
 #### 安装教程
 
 
diff --git a/ept_idle.c b/ept_idle.c
new file mode 100644
index 0000000..ba77c20
--- /dev/null
+++ b/ept_idle.c
@@ -0,0 +1,1038 @@
+// SPDX-License-Identifier: GPL-2.0
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "ept_idle.h"
+#include "ept_idle_native_pagewalk.h"
+#include "tlb_flush.h"
+
+/* #define DEBUG 1 */
+
+/*
+  Fall back to false on kernels that don't support KVM_INVALID_SPTE.
+  ept_idle can still work in this situation, but the scan accuracy may drop, depending on
+  the access frequencies of the workload.
+*/ +#ifdef KVM_INVALID_SPTE + #define KVM_CHECK_INVALID_SPTE(val) (val) == KVM_INVALID_SPTE +#else + #define KVM_CHECK_INVALID_SPTE(val) (0) +#endif + + +#if LINUX_VERSION_CODE == KERNEL_VERSION(4, 17, 0) +# define pgtable_l5_enabled() (pgtable_l5_enabled) +#elif LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0) +# define pgtable_l5_enabled() (0) +#endif + + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 20, 0) +# define kvm_arch_mmu_pointer(vcpu) (vcpu->arch.mmu) +/*For RedHat 7.7 beta*/ +#elif LINUX_VERSION_CODE == KERNEL_VERSION(3, 10, 0) +# define kvm_arch_mmu_pointer(vcpu) (vcpu->arch.mmu) +#else +# define kvm_arch_mmu_pointer(vcpu) (&vcpu->arch.mmu) +#endif + + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 20, 0) +# define kvm_mmu_ad_disabled(mmu) (mmu->mmu_role.base.ad_disabled) +/*For RedHat 7.7 beta*/ +#elif LINUX_VERSION_CODE == KERNEL_VERSION(3, 10, 0) +# define kvm_mmu_ad_disabled(mmu) (mmu->mmu_role.base.ad_disabled) +#else +# define kvm_mmu_ad_disabled(mmu) (mmu->base_role.ad_disabled) +#endif + +#ifdef DEBUG + +#define debug_printk trace_printk + +#define set_restart_gpa(val, note) ({ \ + unsigned long old_val = eic->restart_gpa; \ + eic->restart_gpa = (val); \ + trace_printk("restart_gpa=%lx %luK %s %s %d\n", \ + (val), (eic->restart_gpa - old_val) >> 10, \ + note, __func__, __LINE__); \ +}) + +#define set_next_hva(val, note) ({ \ + unsigned long old_val = eic->next_hva; \ + eic->next_hva = (val); \ + trace_printk(" next_hva=%lx %luK %s %s %d\n", \ + (val), (eic->next_hva - old_val) >> 10, \ + note, __func__, __LINE__); \ +}) + +#else + +#define debug_printk(...) + +#define set_restart_gpa(val, note) ({ \ + eic->restart_gpa = (val); \ +}) + +#define set_next_hva(val, note) ({ \ + eic->next_hva = (val); \ +}) + +#endif + +static struct proc_dir_entry* dir_entry; + +static unsigned long pagetype_size[16] = { + [PTE_ACCESSED] = PAGE_SIZE, /* 4k page */ + [PMD_ACCESSED] = PMD_SIZE, /* 2M page */ + [PUD_PRESENT] = PUD_SIZE, /* 1G page */ + + [PTE_DIRTY] = PAGE_SIZE, + [PMD_DIRTY] = PMD_SIZE, + + [PTE_IDLE] = PAGE_SIZE, + [PMD_IDLE] = PMD_SIZE, + [PMD_IDLE_PTES] = PMD_SIZE, + + [PTE_HOLE] = PAGE_SIZE, + [PMD_HOLE] = PMD_SIZE, +}; + +static void u64_to_u8(uint64_t n, uint8_t *p) +{ + p += sizeof(uint64_t) - 1; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p = n; +} + +static void dump_eic(struct ept_idle_ctrl *eic) +{ + debug_printk("ept_idle_ctrl: pie_read=%d pie_read_max=%d buf_size=%d " + "bytes_copied=%d next_hva=%lx restart_gpa=%lx " + "gpa_to_hva=%lx\n", + eic->pie_read, + eic->pie_read_max, + eic->buf_size, + eic->bytes_copied, + eic->next_hva, + eic->restart_gpa, + eic->gpa_to_hva); +} + +static void eic_report_addr(struct ept_idle_ctrl *eic, unsigned long addr) +{ + unsigned long hva; + eic->kpie[eic->pie_read++] = PIP_CMD_SET_HVA; + hva = addr; + u64_to_u8(hva, &eic->kpie[eic->pie_read]); + eic->pie_read += sizeof(uint64_t); + debug_printk("eic_report_addr %lx\n", addr); + dump_eic(eic); +} + +static int eic_add_page(struct ept_idle_ctrl *eic, + unsigned long addr, + unsigned long next, + enum ProcIdlePageType page_type) +{ + int page_size = pagetype_size[page_type]; + + debug_printk("eic_add_page addr=%lx next=%lx " + "page_type=%d pagesize=%dK\n", + addr, next, (int)page_type, (int)page_size >> 10); + dump_eic(eic); + + /* align kernel/user vision of cursor position */ + next = round_up(next, page_size); + + if (!eic->pie_read || + addr + eic->gpa_to_hva != 
eic->next_hva) { + /* merge hole */ + if (page_type == PTE_HOLE || + page_type == PMD_HOLE) { + set_restart_gpa(next, "PTE_HOLE|PMD_HOLE"); + return 0; + } + + if (addr + eic->gpa_to_hva < eic->next_hva) { + debug_printk("ept_idle: addr moves backwards\n"); + WARN_ONCE(1, "ept_idle: addr moves backwards"); + } + + if (eic->pie_read + sizeof(uint64_t) + 2 >= eic->pie_read_max) { + set_restart_gpa(addr, "EPT_IDLE_KBUF_FULL"); + return EPT_IDLE_KBUF_FULL; + } + + eic_report_addr(eic, round_down(addr, page_size) + + eic->gpa_to_hva); + } else { + if (PIP_TYPE(eic->kpie[eic->pie_read - 1]) == page_type && + PIP_SIZE(eic->kpie[eic->pie_read - 1]) < 0xF) { + set_next_hva(next + eic->gpa_to_hva, "IN-PLACE INC"); + set_restart_gpa(next, "IN-PLACE INC"); + eic->kpie[eic->pie_read - 1]++; + WARN_ONCE(page_size < next-addr, "next-addr too large"); + return 0; + } + if (eic->pie_read >= eic->pie_read_max) { + set_restart_gpa(addr, "EPT_IDLE_KBUF_FULL"); + return EPT_IDLE_KBUF_FULL; + } + } + + set_next_hva(next + eic->gpa_to_hva, "NEW-ITEM"); + set_restart_gpa(next, "NEW-ITEM"); + eic->kpie[eic->pie_read] = PIP_COMPOSE(page_type, 1); + eic->pie_read++; + + return 0; +} + +// Borrowed fronm zhou, jianshi and modified by yy, thanks to jianshi. +static int get_mm_and_kvm_by_pid(pid_t nr, + struct mm_struct** mmp, + struct kvm** kvmp) +{ + struct task_struct* task; + struct files_struct* files; + struct kvm* kvm = NULL; + struct mm_struct* mm = NULL; + struct pid* pid; + int fd, max_fds; + + rcu_read_lock(); + + if(!(pid = find_vpid(nr))) { + rcu_read_unlock(); + printk(KERN_ERR"failed to get vpid for pid = %d\n", nr); + return -ESRCH; + } + + if(!(task = pid_task(pid, PIDTYPE_PID))){ + rcu_read_unlock(); + printk(KERN_ERR"failed to get task_struct for pid = %d\n", nr); + return -ESRCH; + } + + // kthread has no mm_struct* + mm = get_task_mm(task); + if (!mm) { + rcu_read_unlock(); + printk(KERN_ERR"faild to get mm_struct for pid = %d\n", nr); + return -ESRCH; + } + + files = task->files; + max_fds = files_fdtable(files)->max_fds; + for(fd = 0; fd < max_fds; fd++) { + struct file* file; + char buffer[32]; + char* fname; + + if(!(file = fcheck_files(files, fd))) + continue; + + fname = d_path(&(file->f_path), buffer, sizeof(buffer)); + if(fname < buffer || fname >= buffer + sizeof(buffer)) + continue; + + if(strcmp(fname, "anon_inode:kvm-vm") == 0) { + kvm = file->private_data; + if (kvm) + kvm_get_kvm(kvm); + break; + } + } + + rcu_read_unlock(); + *kvmp = kvm; + *mmp = mm; + + return 0; +} + + +static int ept_pte_range(struct ept_idle_ctrl *eic, + pmd_t *pmd, unsigned long addr, unsigned long end) +{ + pte_t *pte; + enum ProcIdlePageType page_type; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (KVM_CHECK_INVALID_SPTE(pte->pte)) { + page_type = PTE_IDLE; + } else if (!ept_pte_present(*pte)) + page_type = PTE_HOLE; + else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + if (eic->flags & SCAN_DIRTY_PAGE) { + if (test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, + (unsigned long *) &pte->pte)) + page_type = PTE_DIRTY; + } + } + + err = eic_add_page(eic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return err; +} + +static int ept_pmd_range(struct ept_idle_ctrl *eic, + pud_t *pud, unsigned long addr, unsigned long end) +{ + pmd_t *pmd; + unsigned long next; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err = 
0; + + if (eic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (KVM_CHECK_INVALID_SPTE(pmd->pmd)) { + page_type = PMD_IDLE; + } else if (!ept_pmd_present(*pmd)) + page_type = PMD_HOLE; /* likely won't hit here */ + else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *)pmd)) { + if (pmd_large(*pmd)) + page_type = PMD_IDLE; + else if (eic->flags & SCAN_SKIM_IDLE) + page_type = PMD_IDLE_PTES; + else + page_type = pte_page_type; + } else if (pmd_large(*pmd)) { + page_type = PMD_ACCESSED; + if (eic->flags & SCAN_DIRTY_PAGE) { + if (test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, + (unsigned long *) pmd)) + page_type = PMD_DIRTY; + } + + } else + page_type = pte_page_type; + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = eic_add_page(eic, addr, next, page_type); + else + err = ept_pte_range(eic, pmd, addr, next); + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + +static int ept_pud_range(struct ept_idle_ctrl *eic, + p4d_t *p4d, unsigned long addr, unsigned long end) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + + if (!ept_pud_present(*pud)) { + set_restart_gpa(next, "PUD_HOLE"); + continue; + } + + if (pud_large(*pud)) + err = eic_add_page(eic, addr, next, PUD_PRESENT); + else + err = ept_pmd_range(eic, pud, addr, next); + + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +static int ept_p4d_range(struct ept_idle_ctrl *eic, + pgd_t *pgd, unsigned long addr, unsigned long end) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (!ept_p4d_present(*p4d)) { + set_restart_gpa(next, "P4D_HOLE"); + continue; + } + + err = ept_pud_range(eic, p4d, addr, next); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + +static int ept_page_range(struct ept_idle_ctrl *eic, + unsigned long addr, + unsigned long end) +{ + struct kvm_vcpu *vcpu; + struct kvm_mmu *mmu; + pgd_t *ept_root; + pgd_t *pgd; + unsigned long next; + int err = 0; + + BUG_ON(addr >= end); + + spin_lock(&eic->kvm->mmu_lock); + + vcpu = kvm_get_vcpu(eic->kvm, 0); + if (!vcpu) { + spin_unlock(&eic->kvm->mmu_lock); + return -EINVAL; + } + + mmu = kvm_arch_mmu_pointer(vcpu); + if (!VALID_PAGE(mmu->root_hpa)) { + spin_unlock(&eic->kvm->mmu_lock); + return -EINVAL; + } + + ept_root = __va(mmu->root_hpa); + + spin_unlock(&eic->kvm->mmu_lock); + local_irq_disable(); + pgd = pgd_offset_pgd(ept_root, addr); + do { + next = pgd_addr_end(addr, end); + if (!ept_pgd_present(*pgd)) { + set_restart_gpa(next, "PGD_HOLE"); + continue; + } + + err = ept_p4d_range(eic, pgd, addr, next); + if (err) + break; + } while (pgd++, addr = next, addr != end); + local_irq_enable(); + return err; +} + +static int init_ept_idle_ctrl_buffer(struct ept_idle_ctrl *eic) +{ + eic->pie_read = 0; + eic->pie_read_max = min(EPT_IDLE_KBUF_SIZE, + eic->buf_size - eic->bytes_copied); + /* reserve space for PIP_CMD_SET_HVA in the end */ + eic->pie_read_max -= sizeof(uint64_t) + 1; + + /* + * Align with EPT_IDLE_KBUF_FULL + * logic in eic_add_page(), to avoid eic->pie_read = 0 when + * EPT_IDLE_KBUF_FULL happened. 
+ */ + if (eic->pie_read_max <= sizeof(uint64_t) + 2) + return EPT_IDLE_KBUF_FULL; + + memset(eic->kpie, 0, sizeof(eic->kpie)); + return 0; +} + +static void setup_ept_idle_ctrl(struct ept_idle_ctrl *eic, void* buf, + int buf_size, unsigned int flags) +{ + eic->buf = buf; + eic->buf_size = buf_size; + eic->bytes_copied = 0; + eic->next_hva = 0; + eic->gpa_to_hva = 0; + eic->restart_gpa = 0; + eic->last_va = 0; + eic->flags = flags; +} + +static int ept_idle_copy_user(struct ept_idle_ctrl *eic, + unsigned long start, unsigned long end) +{ + int bytes_read; + int lc = 0; /* last copy? */ + int ret; + + debug_printk("ept_idle_copy_user %lx %lx\n", start, end); + dump_eic(eic); + + /* Break out of loop on no more progress. */ + if (!eic->pie_read) { + lc = 1; + if (start < end) + start = end; + } + + if (start >= end && start > eic->next_hva) { + set_next_hva(start, "TAIL-HOLE"); + eic_report_addr(eic, start); + } + + bytes_read = eic->pie_read; + if (!bytes_read) + return 1; + + ret = copy_to_user(eic->buf, eic->kpie, bytes_read); + if (ret) + return -EFAULT; + + eic->buf += bytes_read; + eic->bytes_copied += bytes_read; + if (eic->bytes_copied >= eic->buf_size) + return EPT_IDLE_BUF_FULL; + if (lc) + return lc; + ret = init_ept_idle_ctrl_buffer(eic); + if (ret) + return ret; + + cond_resched(); + return 0; +} + +/* + * Depending on whether hva falls in a memslot: + * + * 1) found => return gpa and remaining memslot size in *addr_range + * + * |<----- addr_range --------->| + * [ mem slot ] + * ^hva + * + * 2) not found => return hole size in *addr_range + * + * |<----- addr_range --------->| + * [ first mem slot above hva ] + * ^hva + * + * If hva is above all mem slots, *addr_range will be ~0UL. We can finish read(2). + */ +static unsigned long ept_idle_find_gpa(struct ept_idle_ctrl *eic, + unsigned long hva, + unsigned long *addr_range) +{ + struct kvm *kvm = eic->kvm; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + unsigned long hva_end; + gfn_t gfn; + + *addr_range = ~0UL; + mutex_lock(&kvm->slots_lock); + slots = kvm_memslots(eic->kvm); + kvm_for_each_memslot(memslot, slots) { + hva_end = memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT); + + if (hva >= memslot->userspace_addr && hva < hva_end) { + gpa_t gpa; + gfn = hva_to_gfn_memslot(hva, memslot); + *addr_range = hva_end - hva; + gpa = gfn_to_gpa(gfn); + debug_printk("ept_idle_find_gpa slot %lx=>%llx %lx=>%llx " + "delta %llx size %lx\n", + memslot->userspace_addr, + gfn_to_gpa(memslot->base_gfn), + hva, gpa, + hva - gpa, + memslot->npages << PAGE_SHIFT); + mutex_unlock(&kvm->slots_lock); + return gpa; + } + + if (memslot->userspace_addr > hva) + *addr_range = min(*addr_range, + memslot->userspace_addr - hva); + } + mutex_unlock(&kvm->slots_lock); + return INVALID_PAGE; +} + +static int ept_idle_supports_cpu(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + struct kvm_mmu *mmu; + int ret; + + vcpu = kvm_get_vcpu(kvm, 0); + if (!vcpu) + return -EINVAL; + + spin_lock(&kvm->mmu_lock); + mmu = kvm_arch_mmu_pointer(vcpu); + if (kvm_mmu_ad_disabled(mmu)) { + printk(KERN_NOTICE + "CPU does not support EPT A/D bits tracking\n"); + ret = -EINVAL; + } else if (mmu->shadow_root_level != 4 + (! 
!pgtable_l5_enabled())) { + printk(KERN_NOTICE "Unsupported EPT level %d\n", + mmu->shadow_root_level); + ret = -EINVAL; + } else + ret = 0; + spin_unlock(&kvm->mmu_lock); + + return ret; +} + +static int ept_idle_walk_hva_range(struct ept_idle_ctrl *eic, + unsigned long start, unsigned long end) +{ + unsigned long gpa_addr; + unsigned long addr_range; + unsigned long va_end; + int ret; + + ret = ept_idle_supports_cpu(eic->kvm); + if (ret) + return ret; + + ret = init_ept_idle_ctrl_buffer(eic); + if (ret) + return ret; + + for (; start < end;) { + gpa_addr = ept_idle_find_gpa(eic, start, &addr_range); + + if (gpa_addr == INVALID_PAGE) { + eic->gpa_to_hva = 0; + if (addr_range == ~0UL) /* beyond max virtual address */ { + set_restart_gpa(TASK_SIZE, "EOF"); + va_end = end; + } else { + start += addr_range; + set_restart_gpa(start, "OUT-OF-SLOT"); + va_end = start; + } + } else { + eic->gpa_to_hva = start - gpa_addr; + ept_page_range(eic, gpa_addr, gpa_addr + addr_range); + va_end = eic->gpa_to_hva + gpa_addr + addr_range; + } + + start = eic->restart_gpa + eic->gpa_to_hva; + ret = ept_idle_copy_user(eic, start, va_end); + if (ret) + break; + } + + if (eic->bytes_copied) + ret = 0; + return ret; +} + +static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos); + +static ssize_t ept_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct ept_idle_ctrl *eic = file->private_data; + unsigned long hva_start = *ppos; + unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); + int ret; + + if (!eic) { + printk(KERN_ERR"NULL eic instance\n"); + return -ENOMEM; + } + + if (hva_start >= TASK_SIZE) { + debug_printk("ept_idle_read past TASK_SIZE: %lx %lx\n", + hva_start, TASK_SIZE); + return 0; + } + + if (!eic->mm) + return -EINVAL; + + if (!eic->kvm) + return mm_idle_read(file, buf, count, ppos); + + if (hva_end <= hva_start) { + debug_printk("ept_idle_read past EOF: %lx %lx\n", + hva_start, hva_end); + return 0; + } + if (*ppos & (PAGE_SIZE - 1)) { + debug_printk("ept_idle_read unaligned ppos: %lx\n", + hva_start); + return -EINVAL; + } + if (count < EPT_IDLE_BUF_MIN) { + debug_printk("ept_idle_read small count: %lx\n", + (unsigned long)count); + return -EINVAL; + } + + setup_ept_idle_ctrl(eic, buf, count, file->f_flags); + + ret = ept_idle_walk_hva_range(eic, hva_start, hva_end); + if (ret) + goto out_kvm; + + ret = eic->bytes_copied; + *ppos = eic->next_hva; + debug_printk("ppos=%lx bytes_copied=%d\n", + eic->next_hva, ret); +out_kvm: + return ret; +} + +static int ept_idle_open(struct inode *inode, struct file *file) +{ + struct ept_idle_ctrl* eic; + + if (!try_module_get(THIS_MODULE)) { + file->private_data = NULL; + return -EBUSY; + } + + eic = kzalloc(sizeof(*eic), GFP_KERNEL); + file->private_data = eic; + if (!eic) { + printk(KERN_ERR"Failed to alloc ept_idle_ctrl \n"); + return -ENOMEM; + } + + return 0; +} + +static int ept_idle_release(struct inode *inode, struct file *file) +{ + struct kvm *kvm; + struct ept_idle_ctrl* eic = file->private_data; + int ret = 0; + + if (!eic) + goto out; + + if (eic->kvm) { + kvm = eic->kvm; + spin_lock(&kvm->mmu_lock); + kvm_flush_remote_tlbs(kvm); + spin_unlock(&kvm->mmu_lock); + + kvm_put_kvm(kvm); + } else if (eic->mm) { + copied_flush_tlb_mm_range(eic->mm, 0UL, TLB_FLUSH_ALL, 0UL, true); + } + + if (eic->mm) + mmput(eic->mm); + + kfree(eic); +out: + module_put(THIS_MODULE); + return ret; +} + +static int mm_idle_pte_range(struct ept_idle_ctrl *eic, pmd_t *pmd, + unsigned long addr, unsigned 
long next) +{ + enum ProcIdlePageType page_type; + pte_t *pte; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_present(*pte)) + page_type = PTE_HOLE; + else if (!test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + } + + err = eic_add_page(eic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != next); + + return err; +} + +static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct ept_idle_ctrl *eic = walk->private; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err; + + /* + * Skip duplicate PMD_IDLE_PTES: when the PMD crosses VMA boundary, + * walk_page_range() can call on the same PMD twice. + */ + if ((addr & PMD_MASK) == (eic->last_va & PMD_MASK)) { + debug_printk("ignore duplicate addr %lx %lx\n", + addr, eic->last_va); + return 0; + } + eic->last_va = addr; + + if (eic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; +#if 0 + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmd)) { + if (pmd_large(*pmd)) + page_type = PMD_IDLE; + else if (eic->flags & SCAN_SKIM_IDLE) + page_type = PMD_IDLE_PTES; + else + page_type = pte_page_type; + } else if (pmd_large(*pmd)) { + page_type = PMD_ACCESSED; + } else + page_type = pte_page_type; +#else + // don't clear A bit in PMD for 4K page, which conflicted with pmd_bad() + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!pmd_large(*pmd)) + page_type = pte_page_type; + else if (!test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmd)) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; +#endif + if (page_type != IDLE_PAGE_TYPE_MAX) + err = eic_add_page(eic, addr, next, page_type); + else + err = mm_idle_pte_range(eic, pmd, addr, next); + + return err; +} + +static int mm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct ept_idle_ctrl *eic = walk->private; + + if ((addr & PUD_MASK) != (eic->last_va & PUD_MASK)) { + eic_add_page(eic, addr, next, PUD_PRESENT); + eic->last_va = addr; + } + return 1; +} + +static int mm_idle_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + + if (vma->vm_file) { + if ((vma->vm_flags & (VM_WRITE|VM_MAYSHARE)) == VM_WRITE) + return 0; + return 1; + } + + return 0; +} + +static int mm_idle_walk_range(struct ept_idle_ctrl *eic, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma; + int ret; + + ret = init_ept_idle_ctrl_buffer(eic); + if (ret) + return ret; + + for (; start < end;) + { + down_read(&walk->mm->mmap_sem); + vma = find_vma(walk->mm, start); + if (vma) { + if (end > vma->vm_start) { + local_irq_disable(); + ret = ept_idle_walk_page_range(start, end, walk); + local_irq_enable(); + } else + set_restart_gpa(vma->vm_start, "VMA-HOLE"); + } else + set_restart_gpa(TASK_SIZE, "EOF"); + up_read(&walk->mm->mmap_sem); + + WARN_ONCE(eic->gpa_to_hva, "non-zero gpa_to_hva"); + start = eic->restart_gpa; + ret = ept_idle_copy_user(eic, start, end); + if (ret) + break; + } + + if (eic->bytes_copied) { + if (ret != EPT_IDLE_BUF_FULL && eic->next_hva < end) + debug_printk("partial scan: next_hva=%lx end=%lx\n", + eic->next_hva, end); + ret = 0; + } else + WARN_ONCE(1, "nothing 
read"); + return ret; +} + +static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct ept_idle_ctrl *eic = file->private_data; + struct mm_walk mm_walk = {}; + unsigned long va_start = *ppos; + unsigned long va_end = va_start + (count << (3 + PAGE_SHIFT)); + int ret; + + if (va_end <= va_start) { + debug_printk("mm_idle_read past EOF: %lx %lx\n", + va_start, va_end); + return 0; + } + if (*ppos & (PAGE_SIZE - 1)) { + debug_printk("mm_idle_read unaligned ppos: %lx\n", + va_start); + return -EINVAL; + } + if (count < EPT_IDLE_BUF_MIN) { + debug_printk("mm_idle_read small count: %lx\n", + (unsigned long)count); + return -EINVAL; + } + + setup_ept_idle_ctrl(eic, buf, count, file->f_flags); + + mm_walk.mm = eic->mm; + mm_walk.pmd_entry = mm_idle_pmd_entry; + mm_walk.pud_entry = mm_idle_pud_entry; + mm_walk.test_walk = mm_idle_test_walk; + mm_walk.private = eic; + + ret = mm_idle_walk_range(eic, va_start, va_end, &mm_walk); + if (ret) + goto out_mm; + + ret = eic->bytes_copied; + *ppos = eic->next_hva; + debug_printk("ppos=%lx bytes_copied=%d\n", + eic->next_hva, ret); +out_mm: + return ret; +} + +// copied from fs/proc/base.c mem_lseek +static loff_t ept_idle_lseek(struct file *file, loff_t offset, int orig) +{ + switch (orig) { + case 0: + file->f_pos = offset; + break; + case 1: + file->f_pos += offset; + break; + default: + return -EINVAL; + } + force_successful_syscall_return(); + return file->f_pos; +} + +static long ept_idle_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct ept_idle_ctrl* eic; + pid_t target_pid = (pid_t)arg; + long ret; + + eic = filp->private_data; + if (!eic) { + printk(KERN_ERR"NULL eic instance \n"); + return -ENOMEM; + } + + switch(ioctl) { + case IDLE_PAGE_SET_PID: + ret = get_mm_and_kvm_by_pid(target_pid, &eic->mm, &eic->kvm); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +struct file_operations proc_idle_page_oprations = { + .llseek = ept_idle_lseek, + .read = ept_idle_read, + .open = ept_idle_open, + .release = ept_idle_release, + .unlocked_ioctl = ept_idle_ioctl +}; + +static int ept_idle_entry(void) +{ + dir_entry = proc_create("idle_pages", S_IWUSR | S_IRUGO, NULL, + &proc_idle_page_oprations); + if (!dir_entry) { + printk("Failed to create idle_pages in /porc\n"); + return -ENOMEM; + } + + return 0; +} + +static void ept_idle_exit(void) +{ + if (dir_entry) + proc_remove(dir_entry); +} + +MODULE_LICENSE("GPL"); +module_init(ept_idle_entry); +module_exit(ept_idle_exit); diff --git a/ept_idle.h b/ept_idle.h new file mode 100644 index 0000000..c472eeb --- /dev/null +++ b/ept_idle.h @@ -0,0 +1,123 @@ +#ifndef _EPT_IDLE_H +#define _EPT_IDLE_H + +#include "ept_idle_common.h" + +#define SCAN_HUGE_PAGE O_NONBLOCK /* only huge page */ +#define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */ +#define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */ + +enum ProcIdlePageType { + PTE_ACCESSED, /* 4k page */ + PMD_ACCESSED, /* 2M page */ + PUD_PRESENT, /* 1G page */ + + PTE_DIRTY, + PMD_DIRTY, + + PTE_IDLE, + PMD_IDLE, + PMD_IDLE_PTES, /* all PTE idle */ + + PTE_HOLE, + PMD_HOLE, + + PIP_CMD, + + IDLE_PAGE_TYPE_MAX +}; + +#define PIP_TYPE(a) (0xf & (a >> 4)) +#define PIP_SIZE(a) (0xf & a) +#define PIP_COMPOSE(type, nr) ((type << 4) | nr) + +#define PIP_CMD_SET_HVA PIP_COMPOSE(PIP_CMD, 0) + +#define _PAGE_BIT_EPT_ACCESSED 8 +#define _PAGE_BIT_EPT_DIRTY 9 +#define _PAGE_EPT_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_EPT_ACCESSED) +#define _PAGE_EPT_DIRTY (_AT(pteval_t, 
1) << _PAGE_BIT_EPT_DIRTY) + +#define _PAGE_EPT_PRESENT (_AT(pteval_t, 7)) + +static inline int ept_pte_present(pte_t a) +{ + return pte_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pmd_present(pmd_t a) +{ + return pmd_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pud_present(pud_t a) +{ + return pud_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_p4d_present(p4d_t a) +{ + return p4d_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pgd_present(pgd_t a) +{ + return pgd_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pte_accessed(pte_t a) +{ + return pte_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pmd_accessed(pmd_t a) +{ + return pmd_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pud_accessed(pud_t a) +{ + return pud_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_p4d_accessed(p4d_t a) +{ + return p4d_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pgd_accessed(pgd_t a) +{ + return pgd_flags(a) & _PAGE_EPT_ACCESSED; +} + +extern struct file_operations proc_ept_idle_operations; + +#define EPT_IDLE_KBUF_FULL 1 +#define EPT_IDLE_BUF_FULL 2 +#define EPT_IDLE_BUF_MIN (sizeof(uint64_t) * 2 + 3) + +#define EPT_IDLE_KBUF_SIZE 8000 + +#define IDLE_PAGE_SET_PID _IOW(0x1, 0x1, pid_t) + +struct ept_idle_ctrl { + struct mm_struct *mm; + struct kvm *kvm; + + uint8_t kpie[EPT_IDLE_KBUF_SIZE]; + int pie_read; + int pie_read_max; + + void __user *buf; + int buf_size; + int bytes_copied; + + unsigned long next_hva; /* GPA for EPT; VA for PT */ + unsigned long gpa_to_hva; + unsigned long restart_gpa; + unsigned long last_va; + + unsigned int flags; +}; + +#endif diff --git a/ept_idle_common.h b/ept_idle_common.h new file mode 100644 index 0000000..ee9e915 --- /dev/null +++ b/ept_idle_common.h @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _EPT_IDLE_COMMON_H +#define _EPT_IDLE_COMMON_H + +/* Fix leak of 5 level paging supporting on old kernel*/ +#ifndef CONFIG_PGTABLE_LEVELS + #define EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT +#else + #if CONFIG_PGTABLE_LEVELS < 4 + #define EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT + #endif // #if CONFIG_PGTABLE_LEVELS < 4 +#endif // #ifndef CONFIG_PGTABLE_LEVELS + +#ifdef EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT + +#define p4d_t pgd_t +#define p4d_flags pgd_flags +#define p4d_offset(pgd, start) (pgd) +#define p4d_addr_end(addr, end) (end) +#define p4d_present(p4d) 1 +#define p4d_ERROR(p4d) do { } while(0) +#define p4d_clear pgd_clear +#define p4d_none(p4d) 0 +#define p4d_bad(p4d) 0 +#define p4d_clear_bad pgd_clear_bad +#endif + +#ifndef pgd_offset_pgd +#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address))) +#endif + + +#endif diff --git a/ept_idle_native_pagewalk.c b/ept_idle_native_pagewalk.c new file mode 100644 index 0000000..fed7605 --- /dev/null +++ b/ept_idle_native_pagewalk.c @@ -0,0 +1,465 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copied from kernel mm/pagewalk.c, modified by yuan.yao@intel.com + +#include +#include +#include +#include +#include "ept_idle_common.h" + +#ifdef CONFIG_HUGETLB_PAGE +int pmd_huge(pmd_t pmd) +{ + return !pmd_none(pmd) && + (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; +} + +int pud_huge(pud_t pud) +{ + return !!(pud_val(pud) & _PAGE_PSE); +} + +/* + * ept_idle_huge_pte_offset() - Walk the page table to resolve the hugepage + * entry at address @addr + * + * Return: Pointer to page table or swap entry (PUD or PMD) for + * address @addr, or NULL if a p*d_none() entry is encountered and the + * size @sz doesn't match the 
hugepage size at this level of the page + * table. + */ +pte_t *ept_idle_huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return NULL; + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return NULL; + + pud = pud_offset(p4d, addr); + if (sz != PUD_SIZE && pud_none(*pud)) + return NULL; + /* hugepage or swap? */ + if (pud_huge(*pud) || !pud_present(*pud)) + return (pte_t *)pud; + + pmd = pmd_offset(pud, addr); + if (sz != PMD_SIZE && pmd_none(*pmd)) + return NULL; + /* hugepage or swap? */ + if (pmd_huge(*pmd) || !pmd_present(*pmd)) + return (pte_t *)pmd; + + return NULL; +} + +#else // #ifdef CONFIG_HUGETLB_PAGE +#define pud_huge(x) 0 +#define pmd_huge(x) 0 +#define ept_idle_huge_pte_offset(mm, address, sz) 0 +#endif + +#ifndef VM_BUG_ON_VMA +#define VM_BUG_ON_VMA(cond, vma) \ + do { \ + if (unlikely(cond)) { \ + BUG(); \ + } \ + } while (0) + +#endif + + +#ifndef VM_BUG_ON_MM +#define VM_BUG_ON_MM VM_BUG_ON_VMA +#endif + +static inline int ept_idle_p4d_none_or_clear_bad(p4d_t *p4d) +{ + if (p4d_none(*p4d)) + return 1; + if (unlikely(p4d_bad(*p4d))) { + p4d_clear_bad(p4d); + return 1; + } + return 0; +} + + +static inline spinlock_t *ept_idle_pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) +{ + spinlock_t *ptl; + + VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); + + ptl = pud_lock(vma->vm_mm, pud); + if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) + return ptl; + spin_unlock(ptl); + return NULL; +} + +void p4d_clear_bad(p4d_t *p4d) +{ + p4d_ERROR(*p4d); + p4d_clear(p4d); +} + +void pmd_clear_bad(pmd_t *pmd) +{ + pmd_ERROR(*pmd); + pmd_clear(pmd); +} + +#ifdef _EPT_IDLE_SPLIT_PMD_ +static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pte_t *pte; + int err = 0; + + pte = pte_offset_map(pmd, addr); + for (;;) { + err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk); + if (err) + break; + addr += PAGE_SIZE; + if (addr == end) + break; + pte++; + } + + pte_unmap(pte); + return err; +} +#endif + +static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pmd_t *pmd; + unsigned long next; + int err = 0; + + pmd = pmd_offset(pud, addr); + do { +#ifdef _EPT_IDLE_SPLIT_PMD_ + again: +#endif + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd) || !walk->vma) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) + break; + continue; + } + /* + * This implies that each ->pmd_entry() handler + * needs to know about pmd_trans_huge() pmds + */ + if (walk->pmd_entry) + err = walk->pmd_entry(pmd, addr, next, walk); + if (err) + break; + +#ifdef _EPT_IDLE_SPLIT_PMD_ + /* + * Check this here so we only break down trans_huge + * pages when we _need_ to + */ + if (!walk->pte_entry) + continue; + + split_huge_pmd(walk->vma, pmd, addr); + if (pmd_trans_unstable(pmd)) + goto again; + + err = walk_pte_range(pmd, addr, next, walk); + if (err) + break; +#endif + } while (pmd++, addr = next, addr != end); + + return err; +} + +static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(p4d, addr); + do { +#ifdef _EPT_IDLE_SPLIT_PUD_ + again: +#endif + next = pud_addr_end(addr, end); + if (pud_none(*pud) || !walk->vma) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) 
+ break; + continue; + } + + if (walk->pud_entry) { + spinlock_t *ptl = ept_idle_pud_trans_huge_lock(pud, walk->vma); + + if (ptl) { + err = walk->pud_entry(pud, addr, next, walk); + spin_unlock(ptl); + if (err) + break; + continue; + } + } +#ifdef _EPT_IDLE_SPLIT_PUD_ + split_huge_pud(walk->vma, pud, addr); + if (pud_none(*pud)) + goto again; +#endif + + if (walk->pmd_entry || walk->pte_entry) + err = walk_pmd_range(pud, addr, next, walk); + if (err) + break; + + } while (pud++, addr = next, addr != end); + + return err; +} + +static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (ept_idle_p4d_none_or_clear_bad(p4d)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) + break; + continue; + } + if (walk->pmd_entry || walk->pte_entry) + err = walk_pud_range(p4d, addr, next, walk); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + +static int walk_pgd_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pgd_t *pgd; + unsigned long next; + int err = 0; + + pgd = pgd_offset(walk->mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) + break; + continue; + } + if (walk->pmd_entry || walk->pte_entry) + err = walk_p4d_range(pgd, addr, next, walk); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + return err; +} + +#ifdef CONFIG_HUGETLB_PAGE +static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, + unsigned long end) +{ + unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h); + return boundary < end ? boundary : end; +} + +static int walk_hugetlb_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct hstate *h = hstate_vma(vma); + unsigned long next; + unsigned long hmask = huge_page_mask(h); + unsigned long sz = huge_page_size(h); + pte_t *pte; + int err = 0; + + do { + next = hugetlb_entry_end(h, addr, end); + pte = ept_idle_huge_pte_offset(walk->mm, addr & hmask, sz); + + if (pte) + err = walk->hugetlb_entry(pte, hmask, addr, next, walk); + else if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + + if (err) + break; + } while (addr = next, addr != end); + + return err; +} + +#else /* CONFIG_HUGETLB_PAGE */ +static int walk_hugetlb_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + return 0; +} + +#endif /* CONFIG_HUGETLB_PAGE */ + +/* + * Decide whether we really walk over the current vma on [@start, @end) + * or skip it via the returned value. Return 0 if we do walk over the + * current vma, and return 1 if we skip the vma. Negative values means + * error, where we abort the current walk. + */ +static int walk_page_test(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + + if (walk->test_walk) + return walk->test_walk(start, end, walk); + + /* + * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP + * range, so we don't walk over it as we do for normal vmas. However, + * Some callers are interested in handling hole range and they don't + * want to just ignore any single address range. 
Such users certainly + * define their ->pte_hole() callbacks, so let's delegate them to handle + * vma(VM_PFNMAP). + */ + if (vma->vm_flags & VM_PFNMAP) { + int err = 1; + if (walk->pte_hole) + err = walk->pte_hole(start, end, walk); + return err ? err : 1; + } + return 0; +} + +static int __walk_page_range(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + int err = 0; + struct vm_area_struct *vma = walk->vma; + + if (vma && is_vm_hugetlb_page(vma)) { + if (walk->hugetlb_entry) + err = walk_hugetlb_range(start, end, walk); + } else + err = walk_pgd_range(start, end, walk); + + return err; +} + +/** + * walk_page_range - walk page table with caller specific callbacks + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @walk: mm_walk structure defining the callbacks and the target address space + * + * Recursively walk the page table tree of the process represented by @walk->mm + * within the virtual address range [@start, @end). During walking, we can do + * some caller-specific works for each entry, by setting up pmd_entry(), + * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these + * callbacks, the associated entries/pages are just ignored. + * The return values of these callbacks are commonly defined like below: + * + * - 0 : succeeded to handle the current entry, and if you don't reach the + * end address yet, continue to walk. + * - >0 : succeeded to handle the current entry, and return to the caller + * with caller specific value. + * - <0 : failed to handle the current entry, and return to the caller + * with error code. + * + * Before starting to walk page table, some callers want to check whether + * they really want to walk over the current vma, typically by checking + * its vm_flags. walk_page_test() and @walk->test_walk() are used for this + * purpose. + * + * struct mm_walk keeps current values of some common data like vma and pmd, + * which are useful for the access from callbacks. If you want to pass some + * caller-specific data to callbacks, @walk->private should be helpful. + * + * Locking: + * Callers of walk_page_range() and walk_page_vma() should hold + * @walk->mm->mmap_sem, because these function traverse vma list and/or + * access to vma's data. + */ +int ept_idle_walk_page_range(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + int err = 0; + unsigned long next; + struct vm_area_struct *vma; + + if (start >= end) + return -EINVAL; + + if (!walk->mm) + return -EINVAL; + + VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); + + vma = find_vma(walk->mm, start); + do { + if (!vma) { /* after the last vma */ + walk->vma = NULL; + next = end; + } else if (start < vma->vm_start) { /* outside vma */ + walk->vma = NULL; + next = min(end, vma->vm_start); + } else { /* inside vma */ + walk->vma = vma; + next = min(end, vma->vm_end); + vma = vma->vm_next; + + err = walk_page_test(start, next, walk); + if (err > 0) { + /* + * positive return values are purely for + * controlling the pagewalk, so should never + * be passed to the callers. 
+ */ + err = 0; + continue; + } + if (err < 0) + break; + } + if (walk->vma || walk->pte_hole) + err = __walk_page_range(start, next, walk); + if (err) + break; + } while (start = next, start < end); + return err; +} diff --git a/ept_idle_native_pagewalk.h b/ept_idle_native_pagewalk.h new file mode 100644 index 0000000..42d07b1 --- /dev/null +++ b/ept_idle_native_pagewalk.h @@ -0,0 +1,7 @@ +#ifndef _EPT_IDLE_NATIVE_PAGEWALK_H +#define _EPT_IDLE_NATIVE_PAGEWALK_H + +int ept_idle_walk_page_range(unsigned long start, unsigned long end, + struct mm_walk *walk); + +#endif diff --git a/tlb_flush.c b/tlb_flush.c new file mode 100644 index 0000000..974ec41 --- /dev/null +++ b/tlb_flush.c @@ -0,0 +1,288 @@ +#include "tlb_flush.h" + + +/* copied from 4.20 kernel: + * See Documentation/x86/tlb.txt for details. We choose 33 + * because it is large enough to cover the vast majority (at + * least 95%) of allocations, and is small enough that we are + * confident it will not cause too much overhead. Each single + * flush is about 100 ns, so this caps the maximum overhead at + * _about_ 3,000 ns. + * + * This is in units of pages. + */ +static unsigned long copied_tlb_single_page_flush_ceiling __read_mostly = 33; + + +static bool copied_tlb_is_not_lazy(int cpu, void *data) +{ + return !per_cpu(cpu_tlbstate.is_lazy, cpu); +} + + +/* + * flush_tlb_func_common()'s memory ordering requirement is that any + * TLB fills that happen after we flush the TLB are ordered after we + * read active_mm's tlb_gen. We don't need any explicit barriers + * because all x86 flush operations are serializing and the + * atomic64_read operation won't be reordered by the compiler. + */ +static void copied_flush_tlb_func_common(const struct flush_tlb_info *f, + bool local, enum tlb_flush_reason reason) +{ + /* + * We have three different tlb_gen values in here. They are: + * + * - mm_tlb_gen: the latest generation. + * - local_tlb_gen: the generation that this CPU has already caught + * up to. + * - f->new_tlb_gen: the generation that the requester of the flush + * wants us to catch up to. + */ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); + u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); + + /* This code cannot presently handle being reentered. */ + VM_WARN_ON(!irqs_disabled()); + + /* + * The init_mm is unexported variable, but we don't need + * check this here for our case, we just want to flush + * the TLB on remote CPU cores which is running the task + * using f->mm as memory space + */ +#if 0 + if (unlikely(loaded_mm == &init_mm)) + return; +#else + if (unlikely(loaded_mm != f->mm)) { + return; + } +#endif + + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + + /* + * The caller of this function will set is_lazy to false explicitly + * so we don't need handle this case, just skip this. + */ + #if 0 + if (this_cpu_read(cpu_tlbstate.is_lazy)) { + /* + * We're in lazy mode. We need to at least flush our + * paging-structure cache to avoid speculatively reading + * garbage into our TLB. Since switching to init_mm is barely + * slower than a minimal flush, just switch to init_mm. + * + * This should be rare, with native_flush_tlb_others skipping + * IPIs to lazy TLB mode CPUs. 
+ */ + switch_mm_irqs_off(NULL, &init_mm, NULL); + return; + } +#endif + + if (unlikely(local_tlb_gen == mm_tlb_gen)) { + /* + * There's nothing to do: we're already up to date. This can + * happen if two concurrent flushes happen -- the first flush to + * be handled can catch us all the way up, leaving no work for + * the second flush. + */ + // trace_tlb_flush(reason, 0); + return; + } + + WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); + WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); + + /* + * If we get to this point, we know that our TLB is out of date. + * This does not strictly imply that we need to flush (it's + * possible that f->new_tlb_gen <= local_tlb_gen), but we're + * going to need to flush in the very near future, so we might + * as well get it over with. + * + * The only question is whether to do a full or partial flush. + * + * We do a partial flush if requested and two extra conditions + * are met: + * + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that + * we've always done all needed flushes to catch up to + * local_tlb_gen. If, for example, local_tlb_gen == 2 and + * f->new_tlb_gen == 3, then we know that the flush needed to bring + * us up to date for tlb_gen 3 is the partial flush we're + * processing. + * + * As an example of why this check is needed, suppose that there + * are two concurrent flushes. The first is a full flush that + * changes context.tlb_gen from 1 to 2. The second is a partial + * flush that changes context.tlb_gen from 2 to 3. If they get + * processed on this CPU in reverse order, we'll see + * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. + * If we were to use __flush_tlb_one_user() and set local_tlb_gen to + * 3, we'd be break the invariant: we'd update local_tlb_gen above + * 1 without the full flush that's needed for tlb_gen 2. + * + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. + * Partial TLB flushes are not all that much cheaper than full TLB + * flushes, so it seems unlikely that it would be a performance win + * to do a partial flush if that won't bring our TLB fully up to + * date. By doing a full flush instead, we can increase + * local_tlb_gen all the way to mm_tlb_gen and we can probably + * avoid another flush in the very near future. + */ + if (f->end != TLB_FLUSH_ALL && + f->new_tlb_gen == local_tlb_gen + 1 && + f->new_tlb_gen == mm_tlb_gen) { + /* Partial flush */ + unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift; + unsigned long addr = f->start; + + while (addr < f->end) { + __flush_tlb_one_user(addr); + addr += 1UL << f->stride_shift; + } + if (local) + count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate); + // trace_tlb_flush(reason, nr_invalidate); + } else { + /* Full flush. */ + local_flush_tlb(); + if (local) + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + // trace_tlb_flush(reason, TLB_FLUSH_ALL); + } + + /* Both paths above update our state to mm_tlb_gen. 
*/ + this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); +} + +static void copied_flush_tlb_func_remote(void *info) +{ + const struct flush_tlb_info *f = info; + bool saved_lazy; + + inc_irq_stat(irq_tlb_count); + + if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm)) + return; + + saved_lazy = this_cpu_read(cpu_tlbstate.is_lazy); + this_cpu_write(cpu_tlbstate.is_lazy, false); + + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + copied_flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); + + this_cpu_write(cpu_tlbstate.is_lazy, saved_lazy); +} + + +static void copied_native_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); + +#if 0 + if (info->end == TLB_FLUSH_ALL) + trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); + else + trace_tlb_flush(TLB_REMOTE_SEND_IPI, + (info->end - info->start) >> PAGE_SHIFT); +#endif + /* + * Use non-UV system way in first version to reduce porting affort, + * we will support UV system later if necessary + */ +#if 0 + if (is_uv_system()) { + /* + * This whole special case is confused. UV has a "Broadcast + * Assist Unit", which seems to be a fancy way to send IPIs. + * Back when x86 used an explicit TLB flush IPI, UV was + * optimized to use its own mechanism. These days, x86 uses + * smp_call_function_many(), but UV still uses a manual IPI, + * and that IPI's action is out of date -- it does a manual + * flush instead of calling flush_tlb_func_remote(). This + * means that the percpu tlb_gen variables won't be updated + * and we'll do pointless flushes on future context switches. + * + * Rather than hooking native_flush_tlb_others() here, I think + * that UV should be updated so that smp_call_function_many(), + * etc, are optimal on UV. + */ + unsigned int cpu; + + cpu = smp_processor_id(); + cpumask = uv_flush_tlb_others(cpumask, info); + if (cpumask) + smp_call_function_many(cpumask, copied_flush_tlb_func_remote, + (void *)info, 1); + return; + } +#endif + + /* + * If no page tables were freed, we can skip sending IPIs to + * CPUs in lazy TLB mode. They will flush the CPU themselves + * at the next context switch. + * + * However, if page tables are getting freed, we need to send the + * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping + * up on the new contents of what used to be page tables, while + * doing a speculative memory access. + */ + if (info->freed_tables) + smp_call_function_many(cpumask, copied_flush_tlb_func_remote, + (void *)info, 1); + else + on_each_cpu_cond_mask(copied_tlb_is_not_lazy, copied_flush_tlb_func_remote, + (void *)info, 1, GFP_ATOMIC, cpumask); +} + + +void copied_flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned int stride_shift, + bool freed_tables) +{ + int cpu; + + struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { + .mm = mm, + .stride_shift = stride_shift, + .freed_tables = freed_tables, + }; + + cpu = get_cpu(); + + /* This is also a barrier that synchronizes with switch_mm(). */ + info.new_tlb_gen = inc_mm_tlb_gen(mm); + + /* Should we flush just the requested range? 
 */
+	if ((end != TLB_FLUSH_ALL) &&
+	    ((end - start) >> stride_shift) <= copied_tlb_single_page_flush_ceiling) {
+		info.start = start;
+		info.end = end;
+	} else {
+		info.start = 0UL;
+		info.end = TLB_FLUSH_ALL;
+	}
+
+	/* This should never happen in our case */
+	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
+		VM_WARN_ON(irqs_disabled());
+		local_irq_disable();
+		copied_flush_tlb_func_common(&info, true, TLB_LOCAL_MM_SHOOTDOWN);
+		local_irq_enable();
+	}
+
+	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
+		copied_native_flush_tlb_others(mm_cpumask(mm), &info);
+
+	put_cpu();
+}
+
diff --git a/tlb_flush.h b/tlb_flush.h
new file mode 100644
index 0000000..ca24adf
--- /dev/null
+++ b/tlb_flush.h
@@ -0,0 +1,10 @@
+#ifndef _TLB_FLUSH_H
+#define _TLB_FLUSH_H
+
+#include
+
+void copied_flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+			       unsigned long end, unsigned int stride_shift,
+			       bool freed_tables);
+
+#endif
-- 
GitLab
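
A usage sketch for reference (not part of the patch): the module exposes /proc/idle_pages, and a scan is driven entirely from userspace by binding the file descriptor to a target PID with the IDLE_PAGE_SET_PID ioctl, seeking to the virtual address where the scan should start, and reading back the PIP-encoded byte stream described in ept_idle.h. The short C program below is a minimal illustration under these assumptions: the kvm_ept_idle module from this patch is loaded, the constants mirror ept_idle.h (0xA0 is PIP_COMPOSE(PIP_CMD, 0), and the address payload is written most-significant byte first by u64_to_u8()), and the file name idle_pages_dump.c is hypothetical, not a tool shipped with the module.

/* idle_pages_dump.c - hypothetical userspace reader for /proc/idle_pages.
 * The ioctl number, PIP nibble encoding and page-type table below must
 * match ept_idle.h / ept_idle.c from this patch.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

#define IDLE_PAGE_SET_PID	_IOW(0x1, 0x1, pid_t)	/* must match ept_idle.h */
#define PIP_CMD_SET_HVA		0xA0			/* PIP_COMPOSE(PIP_CMD, 0) */

static const char *type_name[16] = {
	"PTE_ACCESSED", "PMD_ACCESSED", "PUD_PRESENT",
	"PTE_DIRTY", "PMD_DIRTY",
	"PTE_IDLE", "PMD_IDLE", "PMD_IDLE_PTES",
	"PTE_HOLE", "PMD_HOLE",
};

static const unsigned long type_size[16] = {	/* pagetype_size[] in ept_idle.c */
	[0] = 4096, [1] = 2UL << 20, [2] = 1UL << 30,	/* accessed: 4K/2M/1G */
	[3] = 4096, [4] = 2UL << 20,			/* dirty: 4K/2M */
	[5] = 4096, [6] = 2UL << 20, [7] = 2UL << 20,	/* idle: 4K/2M/2M of PTEs */
	[8] = 4096, [9] = 2UL << 20,			/* holes */
};

int main(int argc, char **argv)
{
	uint8_t buf[4096];
	unsigned long va = 0;
	ssize_t n;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}

	/* O_NONBLOCK/O_NOFOLLOW/O_NOATIME select the SCAN_* flags in ept_idle.h. */
	fd = open("/proc/idle_pages", O_RDONLY);
	if (fd < 0) {
		perror("open /proc/idle_pages");
		return 1;
	}

	/* Bind this fd to the target task; the module takes the pid by value. */
	if (ioctl(fd, IDLE_PAGE_SET_PID, (pid_t)atoi(argv[1])) < 0) {
		perror("IDLE_PAGE_SET_PID");
		return 1;
	}

	lseek(fd, 0, SEEK_SET);	/* f_pos is the virtual address to start scanning at */

	while ((n = read(fd, buf, sizeof(buf))) > 0) {
		ssize_t i = 0;

		while (i < n) {
			uint8_t b = buf[i++];

			if (b == PIP_CMD_SET_HVA) {
				/* The next 8 bytes carry the new cursor address, MSB first. */
				va = 0;
				for (int k = 0; k < 8 && i < n; k++)
					va = (va << 8) | buf[i++];
				continue;
			}
			unsigned int type = b >> 4, nr = b & 0xf;

			printf("%#lx: %u x %s\n", va, nr,
			       type_name[type] ? type_name[type] : "UNKNOWN");
			va += (unsigned long)nr * type_size[type];
		}
	}

	close(fd);
	return 0;
}

Whether the scan walks the EPT or the host page tables is decided inside the module: get_mm_and_kvm_by_pid() looks for an "anon_inode:kvm-vm" fd in the target task, so the same userspace flow covers both a QEMU guest and an ordinary process. Opening the proc file with O_NONBLOCK, O_NOFOLLOW or O_NOATIME enables the SCAN_HUGE_PAGE, SCAN_SKIM_IDLE or SCAN_DIRTY_PAGE behaviour defined in ept_idle.h.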