From aee7326bb297102b24bed3dfbb5ba51bca3527e9 Mon Sep 17 00:00:00 2001
From: yanxiaodan
Date: Mon, 10 Aug 2020 20:01:43 +0800
Subject: [PATCH] memory-scan initialization, derived from memory-optimizer
 which is located in https://github.com/intel/memory-optimizer/tree/master/kernel_module

---
 Kbuild                     |    2 +
 Makefile                   |   11 +
 README.en.md               |   17 +-
 README.md                  |    7 +-
 ept_idle.c                 | 1038 ++++++++++++++++++++++++++++++++++++
 ept_idle.h                 |  123 +++++
 ept_idle_common.h          |   33 ++
 ept_idle_native_pagewalk.c |  465 ++++++++++++++++
 ept_idle_native_pagewalk.h |    7 +
 tlb_flush.c                |  288 ++++++++++
 tlb_flush.h                |   10 +
 11 files changed, 1983 insertions(+), 18 deletions(-)
 create mode 100644 Kbuild
 create mode 100644 Makefile
 create mode 100644 ept_idle.c
 create mode 100644 ept_idle.h
 create mode 100644 ept_idle_common.h
 create mode 100644 ept_idle_native_pagewalk.c
 create mode 100644 ept_idle_native_pagewalk.h
 create mode 100644 tlb_flush.c
 create mode 100644 tlb_flush.h

diff --git a/Kbuild b/Kbuild
new file mode 100644
index 0000000..927fb51
--- /dev/null
+++ b/Kbuild
@@ -0,0 +1,2 @@
+obj-m := kvm_ept_idle.o
+kvm_ept_idle-y := ept_idle.o ept_idle_native_pagewalk.o tlb_flush.o
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0ba5bb8
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,11 @@
+include Kbuild
+
+KERNEL_SRC_DIR ?= /lib/modules/$(shell uname -r)/build
+MODULE_DIR ?= $(shell pwd)
+
+default:
+	$(MAKE) -C $(KERNEL_SRC_DIR) M=$(MODULE_DIR) modules
+
+clean:
+	rm -f *.o *.ko *.mod.c modules.order Module.symvers
+
diff --git a/README.en.md b/README.en.md
index 8be96a1..dfdb595 100644
--- a/README.en.md
+++ b/README.en.md
@@ -1,22 +1,13 @@
 # memory-scan
 
 #### Description
-A kernel module for scaning page table of process/VMs
-
-#### Software Architecture
-Software architecture description
+memory-scan is derived from memory-optimizer, which is located at https://github.com/intel/memory-optimizer/tree/master/kernel_module. memory-scan is a kernel module for scanning the page tables of processes/VMs.
 
 #### Installation
 
-1. xxxx
-2. xxxx
-3. xxxx
-
-#### Instructions
-
-1. xxxx
-2. xxxx
-3. xxxx
+openEuler users can build and load memory-scan as follows:
+1. make
+2. insmod kvm_ept_idle.ko
 
 #### Contribution
 
diff --git a/README.md b/README.md
index 5e04446..4a99cc7 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,8 @@
 # memory-scan
 
 #### 介绍
-A kernel module for scaning page table of process/VMs
-
-#### 软件架构
-软件架构说明
-
+memory-scan is derived from memory-optimizer, which is located at https://github.com/intel/memory-optimizer/tree/master/kernel
+_module. memory-scan is a kernel module for scanning the page tables of processes/VMs.
 #### 安装教程
 
 
diff --git a/ept_idle.c b/ept_idle.c
new file mode 100644
index 0000000..ba77c20
--- /dev/null
+++ b/ept_idle.c
@@ -0,0 +1,1038 @@
+// SPDX-License-Identifier: GPL-2.0
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "ept_idle.h"
+#include "ept_idle_native_pagewalk.h"
+#include "tlb_flush.h"
+
+/* #define DEBUG 1 */
+
+/*
+  Fall back to false on kernels that don't support KVM_INVALID_SPTE.
+  ept_idle can still work in this situation, but the scan accuracy may drop, depending on
+  the access frequencies of the workload.
+*/ +#ifdef KVM_INVALID_SPTE + #define KVM_CHECK_INVALID_SPTE(val) (val) == KVM_INVALID_SPTE +#else + #define KVM_CHECK_INVALID_SPTE(val) (0) +#endif + + +#if LINUX_VERSION_CODE == KERNEL_VERSION(4, 17, 0) +# define pgtable_l5_enabled() (pgtable_l5_enabled) +#elif LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0) +# define pgtable_l5_enabled() (0) +#endif + + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 20, 0) +# define kvm_arch_mmu_pointer(vcpu) (vcpu->arch.mmu) +/*For RedHat 7.7 beta*/ +#elif LINUX_VERSION_CODE == KERNEL_VERSION(3, 10, 0) +# define kvm_arch_mmu_pointer(vcpu) (vcpu->arch.mmu) +#else +# define kvm_arch_mmu_pointer(vcpu) (&vcpu->arch.mmu) +#endif + + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 20, 0) +# define kvm_mmu_ad_disabled(mmu) (mmu->mmu_role.base.ad_disabled) +/*For RedHat 7.7 beta*/ +#elif LINUX_VERSION_CODE == KERNEL_VERSION(3, 10, 0) +# define kvm_mmu_ad_disabled(mmu) (mmu->mmu_role.base.ad_disabled) +#else +# define kvm_mmu_ad_disabled(mmu) (mmu->base_role.ad_disabled) +#endif + +#ifdef DEBUG + +#define debug_printk trace_printk + +#define set_restart_gpa(val, note) ({ \ + unsigned long old_val = eic->restart_gpa; \ + eic->restart_gpa = (val); \ + trace_printk("restart_gpa=%lx %luK %s %s %d\n", \ + (val), (eic->restart_gpa - old_val) >> 10, \ + note, __func__, __LINE__); \ +}) + +#define set_next_hva(val, note) ({ \ + unsigned long old_val = eic->next_hva; \ + eic->next_hva = (val); \ + trace_printk(" next_hva=%lx %luK %s %s %d\n", \ + (val), (eic->next_hva - old_val) >> 10, \ + note, __func__, __LINE__); \ +}) + +#else + +#define debug_printk(...) + +#define set_restart_gpa(val, note) ({ \ + eic->restart_gpa = (val); \ +}) + +#define set_next_hva(val, note) ({ \ + eic->next_hva = (val); \ +}) + +#endif + +static struct proc_dir_entry* dir_entry; + +static unsigned long pagetype_size[16] = { + [PTE_ACCESSED] = PAGE_SIZE, /* 4k page */ + [PMD_ACCESSED] = PMD_SIZE, /* 2M page */ + [PUD_PRESENT] = PUD_SIZE, /* 1G page */ + + [PTE_DIRTY] = PAGE_SIZE, + [PMD_DIRTY] = PMD_SIZE, + + [PTE_IDLE] = PAGE_SIZE, + [PMD_IDLE] = PMD_SIZE, + [PMD_IDLE_PTES] = PMD_SIZE, + + [PTE_HOLE] = PAGE_SIZE, + [PMD_HOLE] = PMD_SIZE, +}; + +static void u64_to_u8(uint64_t n, uint8_t *p) +{ + p += sizeof(uint64_t) - 1; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p = n; +} + +static void dump_eic(struct ept_idle_ctrl *eic) +{ + debug_printk("ept_idle_ctrl: pie_read=%d pie_read_max=%d buf_size=%d " + "bytes_copied=%d next_hva=%lx restart_gpa=%lx " + "gpa_to_hva=%lx\n", + eic->pie_read, + eic->pie_read_max, + eic->buf_size, + eic->bytes_copied, + eic->next_hva, + eic->restart_gpa, + eic->gpa_to_hva); +} + +static void eic_report_addr(struct ept_idle_ctrl *eic, unsigned long addr) +{ + unsigned long hva; + eic->kpie[eic->pie_read++] = PIP_CMD_SET_HVA; + hva = addr; + u64_to_u8(hva, &eic->kpie[eic->pie_read]); + eic->pie_read += sizeof(uint64_t); + debug_printk("eic_report_addr %lx\n", addr); + dump_eic(eic); +} + +static int eic_add_page(struct ept_idle_ctrl *eic, + unsigned long addr, + unsigned long next, + enum ProcIdlePageType page_type) +{ + int page_size = pagetype_size[page_type]; + + debug_printk("eic_add_page addr=%lx next=%lx " + "page_type=%d pagesize=%dK\n", + addr, next, (int)page_type, (int)page_size >> 10); + dump_eic(eic); + + /* align kernel/user vision of cursor position */ + next = round_up(next, page_size); + + if (!eic->pie_read || + addr + eic->gpa_to_hva != 
eic->next_hva) { + /* merge hole */ + if (page_type == PTE_HOLE || + page_type == PMD_HOLE) { + set_restart_gpa(next, "PTE_HOLE|PMD_HOLE"); + return 0; + } + + if (addr + eic->gpa_to_hva < eic->next_hva) { + debug_printk("ept_idle: addr moves backwards\n"); + WARN_ONCE(1, "ept_idle: addr moves backwards"); + } + + if (eic->pie_read + sizeof(uint64_t) + 2 >= eic->pie_read_max) { + set_restart_gpa(addr, "EPT_IDLE_KBUF_FULL"); + return EPT_IDLE_KBUF_FULL; + } + + eic_report_addr(eic, round_down(addr, page_size) + + eic->gpa_to_hva); + } else { + if (PIP_TYPE(eic->kpie[eic->pie_read - 1]) == page_type && + PIP_SIZE(eic->kpie[eic->pie_read - 1]) < 0xF) { + set_next_hva(next + eic->gpa_to_hva, "IN-PLACE INC"); + set_restart_gpa(next, "IN-PLACE INC"); + eic->kpie[eic->pie_read - 1]++; + WARN_ONCE(page_size < next-addr, "next-addr too large"); + return 0; + } + if (eic->pie_read >= eic->pie_read_max) { + set_restart_gpa(addr, "EPT_IDLE_KBUF_FULL"); + return EPT_IDLE_KBUF_FULL; + } + } + + set_next_hva(next + eic->gpa_to_hva, "NEW-ITEM"); + set_restart_gpa(next, "NEW-ITEM"); + eic->kpie[eic->pie_read] = PIP_COMPOSE(page_type, 1); + eic->pie_read++; + + return 0; +} + +// Borrowed fronm zhou, jianshi and modified by yy, thanks to jianshi. +static int get_mm_and_kvm_by_pid(pid_t nr, + struct mm_struct** mmp, + struct kvm** kvmp) +{ + struct task_struct* task; + struct files_struct* files; + struct kvm* kvm = NULL; + struct mm_struct* mm = NULL; + struct pid* pid; + int fd, max_fds; + + rcu_read_lock(); + + if(!(pid = find_vpid(nr))) { + rcu_read_unlock(); + printk(KERN_ERR"failed to get vpid for pid = %d\n", nr); + return -ESRCH; + } + + if(!(task = pid_task(pid, PIDTYPE_PID))){ + rcu_read_unlock(); + printk(KERN_ERR"failed to get task_struct for pid = %d\n", nr); + return -ESRCH; + } + + // kthread has no mm_struct* + mm = get_task_mm(task); + if (!mm) { + rcu_read_unlock(); + printk(KERN_ERR"faild to get mm_struct for pid = %d\n", nr); + return -ESRCH; + } + + files = task->files; + max_fds = files_fdtable(files)->max_fds; + for(fd = 0; fd < max_fds; fd++) { + struct file* file; + char buffer[32]; + char* fname; + + if(!(file = fcheck_files(files, fd))) + continue; + + fname = d_path(&(file->f_path), buffer, sizeof(buffer)); + if(fname < buffer || fname >= buffer + sizeof(buffer)) + continue; + + if(strcmp(fname, "anon_inode:kvm-vm") == 0) { + kvm = file->private_data; + if (kvm) + kvm_get_kvm(kvm); + break; + } + } + + rcu_read_unlock(); + *kvmp = kvm; + *mmp = mm; + + return 0; +} + + +static int ept_pte_range(struct ept_idle_ctrl *eic, + pmd_t *pmd, unsigned long addr, unsigned long end) +{ + pte_t *pte; + enum ProcIdlePageType page_type; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (KVM_CHECK_INVALID_SPTE(pte->pte)) { + page_type = PTE_IDLE; + } else if (!ept_pte_present(*pte)) + page_type = PTE_HOLE; + else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + if (eic->flags & SCAN_DIRTY_PAGE) { + if (test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, + (unsigned long *) &pte->pte)) + page_type = PTE_DIRTY; + } + } + + err = eic_add_page(eic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + return err; +} + +static int ept_pmd_range(struct ept_idle_ctrl *eic, + pud_t *pud, unsigned long addr, unsigned long end) +{ + pmd_t *pmd; + unsigned long next; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err = 
0; + + if (eic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (KVM_CHECK_INVALID_SPTE(pmd->pmd)) { + page_type = PMD_IDLE; + } else if (!ept_pmd_present(*pmd)) + page_type = PMD_HOLE; /* likely won't hit here */ + else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, + (unsigned long *)pmd)) { + if (pmd_large(*pmd)) + page_type = PMD_IDLE; + else if (eic->flags & SCAN_SKIM_IDLE) + page_type = PMD_IDLE_PTES; + else + page_type = pte_page_type; + } else if (pmd_large(*pmd)) { + page_type = PMD_ACCESSED; + if (eic->flags & SCAN_DIRTY_PAGE) { + if (test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, + (unsigned long *) pmd)) + page_type = PMD_DIRTY; + } + + } else + page_type = pte_page_type; + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = eic_add_page(eic, addr, next, page_type); + else + err = ept_pte_range(eic, pmd, addr, next); + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + +static int ept_pud_range(struct ept_idle_ctrl *eic, + p4d_t *p4d, unsigned long addr, unsigned long end) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + + if (!ept_pud_present(*pud)) { + set_restart_gpa(next, "PUD_HOLE"); + continue; + } + + if (pud_large(*pud)) + err = eic_add_page(eic, addr, next, PUD_PRESENT); + else + err = ept_pmd_range(eic, pud, addr, next); + + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +static int ept_p4d_range(struct ept_idle_ctrl *eic, + pgd_t *pgd, unsigned long addr, unsigned long end) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (!ept_p4d_present(*p4d)) { + set_restart_gpa(next, "P4D_HOLE"); + continue; + } + + err = ept_pud_range(eic, p4d, addr, next); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + +static int ept_page_range(struct ept_idle_ctrl *eic, + unsigned long addr, + unsigned long end) +{ + struct kvm_vcpu *vcpu; + struct kvm_mmu *mmu; + pgd_t *ept_root; + pgd_t *pgd; + unsigned long next; + int err = 0; + + BUG_ON(addr >= end); + + spin_lock(&eic->kvm->mmu_lock); + + vcpu = kvm_get_vcpu(eic->kvm, 0); + if (!vcpu) { + spin_unlock(&eic->kvm->mmu_lock); + return -EINVAL; + } + + mmu = kvm_arch_mmu_pointer(vcpu); + if (!VALID_PAGE(mmu->root_hpa)) { + spin_unlock(&eic->kvm->mmu_lock); + return -EINVAL; + } + + ept_root = __va(mmu->root_hpa); + + spin_unlock(&eic->kvm->mmu_lock); + local_irq_disable(); + pgd = pgd_offset_pgd(ept_root, addr); + do { + next = pgd_addr_end(addr, end); + if (!ept_pgd_present(*pgd)) { + set_restart_gpa(next, "PGD_HOLE"); + continue; + } + + err = ept_p4d_range(eic, pgd, addr, next); + if (err) + break; + } while (pgd++, addr = next, addr != end); + local_irq_enable(); + return err; +} + +static int init_ept_idle_ctrl_buffer(struct ept_idle_ctrl *eic) +{ + eic->pie_read = 0; + eic->pie_read_max = min(EPT_IDLE_KBUF_SIZE, + eic->buf_size - eic->bytes_copied); + /* reserve space for PIP_CMD_SET_HVA in the end */ + eic->pie_read_max -= sizeof(uint64_t) + 1; + + /* + * Align with EPT_IDLE_KBUF_FULL + * logic in eic_add_page(), to avoid eic->pie_read = 0 when + * EPT_IDLE_KBUF_FULL happened. 
+ */ + if (eic->pie_read_max <= sizeof(uint64_t) + 2) + return EPT_IDLE_KBUF_FULL; + + memset(eic->kpie, 0, sizeof(eic->kpie)); + return 0; +} + +static void setup_ept_idle_ctrl(struct ept_idle_ctrl *eic, void* buf, + int buf_size, unsigned int flags) +{ + eic->buf = buf; + eic->buf_size = buf_size; + eic->bytes_copied = 0; + eic->next_hva = 0; + eic->gpa_to_hva = 0; + eic->restart_gpa = 0; + eic->last_va = 0; + eic->flags = flags; +} + +static int ept_idle_copy_user(struct ept_idle_ctrl *eic, + unsigned long start, unsigned long end) +{ + int bytes_read; + int lc = 0; /* last copy? */ + int ret; + + debug_printk("ept_idle_copy_user %lx %lx\n", start, end); + dump_eic(eic); + + /* Break out of loop on no more progress. */ + if (!eic->pie_read) { + lc = 1; + if (start < end) + start = end; + } + + if (start >= end && start > eic->next_hva) { + set_next_hva(start, "TAIL-HOLE"); + eic_report_addr(eic, start); + } + + bytes_read = eic->pie_read; + if (!bytes_read) + return 1; + + ret = copy_to_user(eic->buf, eic->kpie, bytes_read); + if (ret) + return -EFAULT; + + eic->buf += bytes_read; + eic->bytes_copied += bytes_read; + if (eic->bytes_copied >= eic->buf_size) + return EPT_IDLE_BUF_FULL; + if (lc) + return lc; + ret = init_ept_idle_ctrl_buffer(eic); + if (ret) + return ret; + + cond_resched(); + return 0; +} + +/* + * Depending on whether hva falls in a memslot: + * + * 1) found => return gpa and remaining memslot size in *addr_range + * + * |<----- addr_range --------->| + * [ mem slot ] + * ^hva + * + * 2) not found => return hole size in *addr_range + * + * |<----- addr_range --------->| + * [ first mem slot above hva ] + * ^hva + * + * If hva is above all mem slots, *addr_range will be ~0UL. We can finish read(2). + */ +static unsigned long ept_idle_find_gpa(struct ept_idle_ctrl *eic, + unsigned long hva, + unsigned long *addr_range) +{ + struct kvm *kvm = eic->kvm; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + unsigned long hva_end; + gfn_t gfn; + + *addr_range = ~0UL; + mutex_lock(&kvm->slots_lock); + slots = kvm_memslots(eic->kvm); + kvm_for_each_memslot(memslot, slots) { + hva_end = memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT); + + if (hva >= memslot->userspace_addr && hva < hva_end) { + gpa_t gpa; + gfn = hva_to_gfn_memslot(hva, memslot); + *addr_range = hva_end - hva; + gpa = gfn_to_gpa(gfn); + debug_printk("ept_idle_find_gpa slot %lx=>%llx %lx=>%llx " + "delta %llx size %lx\n", + memslot->userspace_addr, + gfn_to_gpa(memslot->base_gfn), + hva, gpa, + hva - gpa, + memslot->npages << PAGE_SHIFT); + mutex_unlock(&kvm->slots_lock); + return gpa; + } + + if (memslot->userspace_addr > hva) + *addr_range = min(*addr_range, + memslot->userspace_addr - hva); + } + mutex_unlock(&kvm->slots_lock); + return INVALID_PAGE; +} + +static int ept_idle_supports_cpu(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + struct kvm_mmu *mmu; + int ret; + + vcpu = kvm_get_vcpu(kvm, 0); + if (!vcpu) + return -EINVAL; + + spin_lock(&kvm->mmu_lock); + mmu = kvm_arch_mmu_pointer(vcpu); + if (kvm_mmu_ad_disabled(mmu)) { + printk(KERN_NOTICE + "CPU does not support EPT A/D bits tracking\n"); + ret = -EINVAL; + } else if (mmu->shadow_root_level != 4 + (! 
!pgtable_l5_enabled())) { + printk(KERN_NOTICE "Unsupported EPT level %d\n", + mmu->shadow_root_level); + ret = -EINVAL; + } else + ret = 0; + spin_unlock(&kvm->mmu_lock); + + return ret; +} + +static int ept_idle_walk_hva_range(struct ept_idle_ctrl *eic, + unsigned long start, unsigned long end) +{ + unsigned long gpa_addr; + unsigned long addr_range; + unsigned long va_end; + int ret; + + ret = ept_idle_supports_cpu(eic->kvm); + if (ret) + return ret; + + ret = init_ept_idle_ctrl_buffer(eic); + if (ret) + return ret; + + for (; start < end;) { + gpa_addr = ept_idle_find_gpa(eic, start, &addr_range); + + if (gpa_addr == INVALID_PAGE) { + eic->gpa_to_hva = 0; + if (addr_range == ~0UL) /* beyond max virtual address */ { + set_restart_gpa(TASK_SIZE, "EOF"); + va_end = end; + } else { + start += addr_range; + set_restart_gpa(start, "OUT-OF-SLOT"); + va_end = start; + } + } else { + eic->gpa_to_hva = start - gpa_addr; + ept_page_range(eic, gpa_addr, gpa_addr + addr_range); + va_end = eic->gpa_to_hva + gpa_addr + addr_range; + } + + start = eic->restart_gpa + eic->gpa_to_hva; + ret = ept_idle_copy_user(eic, start, va_end); + if (ret) + break; + } + + if (eic->bytes_copied) + ret = 0; + return ret; +} + +static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos); + +static ssize_t ept_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct ept_idle_ctrl *eic = file->private_data; + unsigned long hva_start = *ppos; + unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); + int ret; + + if (!eic) { + printk(KERN_ERR"NULL eic instance\n"); + return -ENOMEM; + } + + if (hva_start >= TASK_SIZE) { + debug_printk("ept_idle_read past TASK_SIZE: %lx %lx\n", + hva_start, TASK_SIZE); + return 0; + } + + if (!eic->mm) + return -EINVAL; + + if (!eic->kvm) + return mm_idle_read(file, buf, count, ppos); + + if (hva_end <= hva_start) { + debug_printk("ept_idle_read past EOF: %lx %lx\n", + hva_start, hva_end); + return 0; + } + if (*ppos & (PAGE_SIZE - 1)) { + debug_printk("ept_idle_read unaligned ppos: %lx\n", + hva_start); + return -EINVAL; + } + if (count < EPT_IDLE_BUF_MIN) { + debug_printk("ept_idle_read small count: %lx\n", + (unsigned long)count); + return -EINVAL; + } + + setup_ept_idle_ctrl(eic, buf, count, file->f_flags); + + ret = ept_idle_walk_hva_range(eic, hva_start, hva_end); + if (ret) + goto out_kvm; + + ret = eic->bytes_copied; + *ppos = eic->next_hva; + debug_printk("ppos=%lx bytes_copied=%d\n", + eic->next_hva, ret); +out_kvm: + return ret; +} + +static int ept_idle_open(struct inode *inode, struct file *file) +{ + struct ept_idle_ctrl* eic; + + if (!try_module_get(THIS_MODULE)) { + file->private_data = NULL; + return -EBUSY; + } + + eic = kzalloc(sizeof(*eic), GFP_KERNEL); + file->private_data = eic; + if (!eic) { + printk(KERN_ERR"Failed to alloc ept_idle_ctrl \n"); + return -ENOMEM; + } + + return 0; +} + +static int ept_idle_release(struct inode *inode, struct file *file) +{ + struct kvm *kvm; + struct ept_idle_ctrl* eic = file->private_data; + int ret = 0; + + if (!eic) + goto out; + + if (eic->kvm) { + kvm = eic->kvm; + spin_lock(&kvm->mmu_lock); + kvm_flush_remote_tlbs(kvm); + spin_unlock(&kvm->mmu_lock); + + kvm_put_kvm(kvm); + } else if (eic->mm) { + copied_flush_tlb_mm_range(eic->mm, 0UL, TLB_FLUSH_ALL, 0UL, true); + } + + if (eic->mm) + mmput(eic->mm); + + kfree(eic); +out: + module_put(THIS_MODULE); + return ret; +} + +static int mm_idle_pte_range(struct ept_idle_ctrl *eic, pmd_t *pmd, + unsigned long addr, unsigned 
long next) +{ + enum ProcIdlePageType page_type; + pte_t *pte; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_present(*pte)) + page_type = PTE_HOLE; + else if (!test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + } + + err = eic_add_page(eic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != next); + + return err; +} + +static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct ept_idle_ctrl *eic = walk->private; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err; + + /* + * Skip duplicate PMD_IDLE_PTES: when the PMD crosses VMA boundary, + * walk_page_range() can call on the same PMD twice. + */ + if ((addr & PMD_MASK) == (eic->last_va & PMD_MASK)) { + debug_printk("ignore duplicate addr %lx %lx\n", + addr, eic->last_va); + return 0; + } + eic->last_va = addr; + + if (eic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; +#if 0 + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmd)) { + if (pmd_large(*pmd)) + page_type = PMD_IDLE; + else if (eic->flags & SCAN_SKIM_IDLE) + page_type = PMD_IDLE_PTES; + else + page_type = pte_page_type; + } else if (pmd_large(*pmd)) { + page_type = PMD_ACCESSED; + } else + page_type = pte_page_type; +#else + // don't clear A bit in PMD for 4K page, which conflicted with pmd_bad() + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!pmd_large(*pmd)) + page_type = pte_page_type; + else if (!test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmd)) + page_type = PMD_IDLE; + else + page_type = PMD_ACCESSED; +#endif + if (page_type != IDLE_PAGE_TYPE_MAX) + err = eic_add_page(eic, addr, next, page_type); + else + err = mm_idle_pte_range(eic, pmd, addr, next); + + return err; +} + +static int mm_idle_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct ept_idle_ctrl *eic = walk->private; + + if ((addr & PUD_MASK) != (eic->last_va & PUD_MASK)) { + eic_add_page(eic, addr, next, PUD_PRESENT); + eic->last_va = addr; + } + return 1; +} + +static int mm_idle_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + + if (vma->vm_file) { + if ((vma->vm_flags & (VM_WRITE|VM_MAYSHARE)) == VM_WRITE) + return 0; + return 1; + } + + return 0; +} + +static int mm_idle_walk_range(struct ept_idle_ctrl *eic, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma; + int ret; + + ret = init_ept_idle_ctrl_buffer(eic); + if (ret) + return ret; + + for (; start < end;) + { + down_read(&walk->mm->mmap_sem); + vma = find_vma(walk->mm, start); + if (vma) { + if (end > vma->vm_start) { + local_irq_disable(); + ret = ept_idle_walk_page_range(start, end, walk); + local_irq_enable(); + } else + set_restart_gpa(vma->vm_start, "VMA-HOLE"); + } else + set_restart_gpa(TASK_SIZE, "EOF"); + up_read(&walk->mm->mmap_sem); + + WARN_ONCE(eic->gpa_to_hva, "non-zero gpa_to_hva"); + start = eic->restart_gpa; + ret = ept_idle_copy_user(eic, start, end); + if (ret) + break; + } + + if (eic->bytes_copied) { + if (ret != EPT_IDLE_BUF_FULL && eic->next_hva < end) + debug_printk("partial scan: next_hva=%lx end=%lx\n", + eic->next_hva, end); + ret = 0; + } else + WARN_ONCE(1, "nothing 
read"); + return ret; +} + +static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + struct ept_idle_ctrl *eic = file->private_data; + struct mm_walk mm_walk = {}; + unsigned long va_start = *ppos; + unsigned long va_end = va_start + (count << (3 + PAGE_SHIFT)); + int ret; + + if (va_end <= va_start) { + debug_printk("mm_idle_read past EOF: %lx %lx\n", + va_start, va_end); + return 0; + } + if (*ppos & (PAGE_SIZE - 1)) { + debug_printk("mm_idle_read unaligned ppos: %lx\n", + va_start); + return -EINVAL; + } + if (count < EPT_IDLE_BUF_MIN) { + debug_printk("mm_idle_read small count: %lx\n", + (unsigned long)count); + return -EINVAL; + } + + setup_ept_idle_ctrl(eic, buf, count, file->f_flags); + + mm_walk.mm = eic->mm; + mm_walk.pmd_entry = mm_idle_pmd_entry; + mm_walk.pud_entry = mm_idle_pud_entry; + mm_walk.test_walk = mm_idle_test_walk; + mm_walk.private = eic; + + ret = mm_idle_walk_range(eic, va_start, va_end, &mm_walk); + if (ret) + goto out_mm; + + ret = eic->bytes_copied; + *ppos = eic->next_hva; + debug_printk("ppos=%lx bytes_copied=%d\n", + eic->next_hva, ret); +out_mm: + return ret; +} + +// copied from fs/proc/base.c mem_lseek +static loff_t ept_idle_lseek(struct file *file, loff_t offset, int orig) +{ + switch (orig) { + case 0: + file->f_pos = offset; + break; + case 1: + file->f_pos += offset; + break; + default: + return -EINVAL; + } + force_successful_syscall_return(); + return file->f_pos; +} + +static long ept_idle_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct ept_idle_ctrl* eic; + pid_t target_pid = (pid_t)arg; + long ret; + + eic = filp->private_data; + if (!eic) { + printk(KERN_ERR"NULL eic instance \n"); + return -ENOMEM; + } + + switch(ioctl) { + case IDLE_PAGE_SET_PID: + ret = get_mm_and_kvm_by_pid(target_pid, &eic->mm, &eic->kvm); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +struct file_operations proc_idle_page_oprations = { + .llseek = ept_idle_lseek, + .read = ept_idle_read, + .open = ept_idle_open, + .release = ept_idle_release, + .unlocked_ioctl = ept_idle_ioctl +}; + +static int ept_idle_entry(void) +{ + dir_entry = proc_create("idle_pages", S_IWUSR | S_IRUGO, NULL, + &proc_idle_page_oprations); + if (!dir_entry) { + printk("Failed to create idle_pages in /porc\n"); + return -ENOMEM; + } + + return 0; +} + +static void ept_idle_exit(void) +{ + if (dir_entry) + proc_remove(dir_entry); +} + +MODULE_LICENSE("GPL"); +module_init(ept_idle_entry); +module_exit(ept_idle_exit); diff --git a/ept_idle.h b/ept_idle.h new file mode 100644 index 0000000..c472eeb --- /dev/null +++ b/ept_idle.h @@ -0,0 +1,123 @@ +#ifndef _EPT_IDLE_H +#define _EPT_IDLE_H + +#include "ept_idle_common.h" + +#define SCAN_HUGE_PAGE O_NONBLOCK /* only huge page */ +#define SCAN_SKIM_IDLE O_NOFOLLOW /* stop on PMD_IDLE_PTES */ +#define SCAN_DIRTY_PAGE O_NOATIME /* report pte/pmd dirty bit */ + +enum ProcIdlePageType { + PTE_ACCESSED, /* 4k page */ + PMD_ACCESSED, /* 2M page */ + PUD_PRESENT, /* 1G page */ + + PTE_DIRTY, + PMD_DIRTY, + + PTE_IDLE, + PMD_IDLE, + PMD_IDLE_PTES, /* all PTE idle */ + + PTE_HOLE, + PMD_HOLE, + + PIP_CMD, + + IDLE_PAGE_TYPE_MAX +}; + +#define PIP_TYPE(a) (0xf & (a >> 4)) +#define PIP_SIZE(a) (0xf & a) +#define PIP_COMPOSE(type, nr) ((type << 4) | nr) + +#define PIP_CMD_SET_HVA PIP_COMPOSE(PIP_CMD, 0) + +#define _PAGE_BIT_EPT_ACCESSED 8 +#define _PAGE_BIT_EPT_DIRTY 9 +#define _PAGE_EPT_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_EPT_ACCESSED) +#define _PAGE_EPT_DIRTY (_AT(pteval_t, 
1) << _PAGE_BIT_EPT_DIRTY) + +#define _PAGE_EPT_PRESENT (_AT(pteval_t, 7)) + +static inline int ept_pte_present(pte_t a) +{ + return pte_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pmd_present(pmd_t a) +{ + return pmd_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pud_present(pud_t a) +{ + return pud_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_p4d_present(p4d_t a) +{ + return p4d_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pgd_present(pgd_t a) +{ + return pgd_flags(a) & _PAGE_EPT_PRESENT; +} + +static inline int ept_pte_accessed(pte_t a) +{ + return pte_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pmd_accessed(pmd_t a) +{ + return pmd_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pud_accessed(pud_t a) +{ + return pud_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_p4d_accessed(p4d_t a) +{ + return p4d_flags(a) & _PAGE_EPT_ACCESSED; +} + +static inline int ept_pgd_accessed(pgd_t a) +{ + return pgd_flags(a) & _PAGE_EPT_ACCESSED; +} + +extern struct file_operations proc_ept_idle_operations; + +#define EPT_IDLE_KBUF_FULL 1 +#define EPT_IDLE_BUF_FULL 2 +#define EPT_IDLE_BUF_MIN (sizeof(uint64_t) * 2 + 3) + +#define EPT_IDLE_KBUF_SIZE 8000 + +#define IDLE_PAGE_SET_PID _IOW(0x1, 0x1, pid_t) + +struct ept_idle_ctrl { + struct mm_struct *mm; + struct kvm *kvm; + + uint8_t kpie[EPT_IDLE_KBUF_SIZE]; + int pie_read; + int pie_read_max; + + void __user *buf; + int buf_size; + int bytes_copied; + + unsigned long next_hva; /* GPA for EPT; VA for PT */ + unsigned long gpa_to_hva; + unsigned long restart_gpa; + unsigned long last_va; + + unsigned int flags; +}; + +#endif diff --git a/ept_idle_common.h b/ept_idle_common.h new file mode 100644 index 0000000..ee9e915 --- /dev/null +++ b/ept_idle_common.h @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _EPT_IDLE_COMMON_H +#define _EPT_IDLE_COMMON_H + +/* Fix leak of 5 level paging supporting on old kernel*/ +#ifndef CONFIG_PGTABLE_LEVELS + #define EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT +#else + #if CONFIG_PGTABLE_LEVELS < 4 + #define EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT + #endif // #if CONFIG_PGTABLE_LEVELS < 4 +#endif // #ifndef CONFIG_PGTABLE_LEVELS + +#ifdef EPT_IDLE_5_LEVEL_PGTABLE_SUPPORT + +#define p4d_t pgd_t +#define p4d_flags pgd_flags +#define p4d_offset(pgd, start) (pgd) +#define p4d_addr_end(addr, end) (end) +#define p4d_present(p4d) 1 +#define p4d_ERROR(p4d) do { } while(0) +#define p4d_clear pgd_clear +#define p4d_none(p4d) 0 +#define p4d_bad(p4d) 0 +#define p4d_clear_bad pgd_clear_bad +#endif + +#ifndef pgd_offset_pgd +#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address))) +#endif + + +#endif diff --git a/ept_idle_native_pagewalk.c b/ept_idle_native_pagewalk.c new file mode 100644 index 0000000..fed7605 --- /dev/null +++ b/ept_idle_native_pagewalk.c @@ -0,0 +1,465 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copied from kernel mm/pagewalk.c, modified by yuan.yao@intel.com + +#include +#include +#include +#include +#include "ept_idle_common.h" + +#ifdef CONFIG_HUGETLB_PAGE +int pmd_huge(pmd_t pmd) +{ + return !pmd_none(pmd) && + (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; +} + +int pud_huge(pud_t pud) +{ + return !!(pud_val(pud) & _PAGE_PSE); +} + +/* + * ept_idle_huge_pte_offset() - Walk the page table to resolve the hugepage + * entry at address @addr + * + * Return: Pointer to page table or swap entry (PUD or PMD) for + * address @addr, or NULL if a p*d_none() entry is encountered and the + * size @sz doesn't match the 
hugepage size at this level of the page + * table. + */ +pte_t *ept_idle_huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return NULL; + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return NULL; + + pud = pud_offset(p4d, addr); + if (sz != PUD_SIZE && pud_none(*pud)) + return NULL; + /* hugepage or swap? */ + if (pud_huge(*pud) || !pud_present(*pud)) + return (pte_t *)pud; + + pmd = pmd_offset(pud, addr); + if (sz != PMD_SIZE && pmd_none(*pmd)) + return NULL; + /* hugepage or swap? */ + if (pmd_huge(*pmd) || !pmd_present(*pmd)) + return (pte_t *)pmd; + + return NULL; +} + +#else // #ifdef CONFIG_HUGETLB_PAGE +#define pud_huge(x) 0 +#define pmd_huge(x) 0 +#define ept_idle_huge_pte_offset(mm, address, sz) 0 +#endif + +#ifndef VM_BUG_ON_VMA +#define VM_BUG_ON_VMA(cond, vma) \ + do { \ + if (unlikely(cond)) { \ + BUG(); \ + } \ + } while (0) + +#endif + + +#ifndef VM_BUG_ON_MM +#define VM_BUG_ON_MM VM_BUG_ON_VMA +#endif + +static inline int ept_idle_p4d_none_or_clear_bad(p4d_t *p4d) +{ + if (p4d_none(*p4d)) + return 1; + if (unlikely(p4d_bad(*p4d))) { + p4d_clear_bad(p4d); + return 1; + } + return 0; +} + + +static inline spinlock_t *ept_idle_pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) +{ + spinlock_t *ptl; + + VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); + + ptl = pud_lock(vma->vm_mm, pud); + if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) + return ptl; + spin_unlock(ptl); + return NULL; +} + +void p4d_clear_bad(p4d_t *p4d) +{ + p4d_ERROR(*p4d); + p4d_clear(p4d); +} + +void pmd_clear_bad(pmd_t *pmd) +{ + pmd_ERROR(*pmd); + pmd_clear(pmd); +} + +#ifdef _EPT_IDLE_SPLIT_PMD_ +static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pte_t *pte; + int err = 0; + + pte = pte_offset_map(pmd, addr); + for (;;) { + err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk); + if (err) + break; + addr += PAGE_SIZE; + if (addr == end) + break; + pte++; + } + + pte_unmap(pte); + return err; +} +#endif + +static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pmd_t *pmd; + unsigned long next; + int err = 0; + + pmd = pmd_offset(pud, addr); + do { +#ifdef _EPT_IDLE_SPLIT_PMD_ + again: +#endif + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd) || !walk->vma) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) + break; + continue; + } + /* + * This implies that each ->pmd_entry() handler + * needs to know about pmd_trans_huge() pmds + */ + if (walk->pmd_entry) + err = walk->pmd_entry(pmd, addr, next, walk); + if (err) + break; + +#ifdef _EPT_IDLE_SPLIT_PMD_ + /* + * Check this here so we only break down trans_huge + * pages when we _need_ to + */ + if (!walk->pte_entry) + continue; + + split_huge_pmd(walk->vma, pmd, addr); + if (pmd_trans_unstable(pmd)) + goto again; + + err = walk_pte_range(pmd, addr, next, walk); + if (err) + break; +#endif + } while (pmd++, addr = next, addr != end); + + return err; +} + +static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(p4d, addr); + do { +#ifdef _EPT_IDLE_SPLIT_PUD_ + again: +#endif + next = pud_addr_end(addr, end); + if (pud_none(*pud) || !walk->vma) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) 
+ break; + continue; + } + + if (walk->pud_entry) { + spinlock_t *ptl = ept_idle_pud_trans_huge_lock(pud, walk->vma); + + if (ptl) { + err = walk->pud_entry(pud, addr, next, walk); + spin_unlock(ptl); + if (err) + break; + continue; + } + } +#ifdef _EPT_IDLE_SPLIT_PUD_ + split_huge_pud(walk->vma, pud, addr); + if (pud_none(*pud)) + goto again; +#endif + + if (walk->pmd_entry || walk->pte_entry) + err = walk_pmd_range(pud, addr, next, walk); + if (err) + break; + + } while (pud++, addr = next, addr != end); + + return err; +} + +static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (ept_idle_p4d_none_or_clear_bad(p4d)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) + break; + continue; + } + if (walk->pmd_entry || walk->pte_entry) + err = walk_pud_range(p4d, addr, next, walk); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + +static int walk_pgd_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pgd_t *pgd; + unsigned long next; + int err = 0; + + pgd = pgd_offset(walk->mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) + break; + continue; + } + if (walk->pmd_entry || walk->pte_entry) + err = walk_p4d_range(pgd, addr, next, walk); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + return err; +} + +#ifdef CONFIG_HUGETLB_PAGE +static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, + unsigned long end) +{ + unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h); + return boundary < end ? boundary : end; +} + +static int walk_hugetlb_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct hstate *h = hstate_vma(vma); + unsigned long next; + unsigned long hmask = huge_page_mask(h); + unsigned long sz = huge_page_size(h); + pte_t *pte; + int err = 0; + + do { + next = hugetlb_entry_end(h, addr, end); + pte = ept_idle_huge_pte_offset(walk->mm, addr & hmask, sz); + + if (pte) + err = walk->hugetlb_entry(pte, hmask, addr, next, walk); + else if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + + if (err) + break; + } while (addr = next, addr != end); + + return err; +} + +#else /* CONFIG_HUGETLB_PAGE */ +static int walk_hugetlb_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + return 0; +} + +#endif /* CONFIG_HUGETLB_PAGE */ + +/* + * Decide whether we really walk over the current vma on [@start, @end) + * or skip it via the returned value. Return 0 if we do walk over the + * current vma, and return 1 if we skip the vma. Negative values means + * error, where we abort the current walk. + */ +static int walk_page_test(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + + if (walk->test_walk) + return walk->test_walk(start, end, walk); + + /* + * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP + * range, so we don't walk over it as we do for normal vmas. However, + * Some callers are interested in handling hole range and they don't + * want to just ignore any single address range. 
Such users certainly + * define their ->pte_hole() callbacks, so let's delegate them to handle + * vma(VM_PFNMAP). + */ + if (vma->vm_flags & VM_PFNMAP) { + int err = 1; + if (walk->pte_hole) + err = walk->pte_hole(start, end, walk); + return err ? err : 1; + } + return 0; +} + +static int __walk_page_range(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + int err = 0; + struct vm_area_struct *vma = walk->vma; + + if (vma && is_vm_hugetlb_page(vma)) { + if (walk->hugetlb_entry) + err = walk_hugetlb_range(start, end, walk); + } else + err = walk_pgd_range(start, end, walk); + + return err; +} + +/** + * walk_page_range - walk page table with caller specific callbacks + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @walk: mm_walk structure defining the callbacks and the target address space + * + * Recursively walk the page table tree of the process represented by @walk->mm + * within the virtual address range [@start, @end). During walking, we can do + * some caller-specific works for each entry, by setting up pmd_entry(), + * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these + * callbacks, the associated entries/pages are just ignored. + * The return values of these callbacks are commonly defined like below: + * + * - 0 : succeeded to handle the current entry, and if you don't reach the + * end address yet, continue to walk. + * - >0 : succeeded to handle the current entry, and return to the caller + * with caller specific value. + * - <0 : failed to handle the current entry, and return to the caller + * with error code. + * + * Before starting to walk page table, some callers want to check whether + * they really want to walk over the current vma, typically by checking + * its vm_flags. walk_page_test() and @walk->test_walk() are used for this + * purpose. + * + * struct mm_walk keeps current values of some common data like vma and pmd, + * which are useful for the access from callbacks. If you want to pass some + * caller-specific data to callbacks, @walk->private should be helpful. + * + * Locking: + * Callers of walk_page_range() and walk_page_vma() should hold + * @walk->mm->mmap_sem, because these function traverse vma list and/or + * access to vma's data. + */ +int ept_idle_walk_page_range(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + int err = 0; + unsigned long next; + struct vm_area_struct *vma; + + if (start >= end) + return -EINVAL; + + if (!walk->mm) + return -EINVAL; + + VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); + + vma = find_vma(walk->mm, start); + do { + if (!vma) { /* after the last vma */ + walk->vma = NULL; + next = end; + } else if (start < vma->vm_start) { /* outside vma */ + walk->vma = NULL; + next = min(end, vma->vm_start); + } else { /* inside vma */ + walk->vma = vma; + next = min(end, vma->vm_end); + vma = vma->vm_next; + + err = walk_page_test(start, next, walk); + if (err > 0) { + /* + * positive return values are purely for + * controlling the pagewalk, so should never + * be passed to the callers. 
+ */ + err = 0; + continue; + } + if (err < 0) + break; + } + if (walk->vma || walk->pte_hole) + err = __walk_page_range(start, next, walk); + if (err) + break; + } while (start = next, start < end); + return err; +} diff --git a/ept_idle_native_pagewalk.h b/ept_idle_native_pagewalk.h new file mode 100644 index 0000000..42d07b1 --- /dev/null +++ b/ept_idle_native_pagewalk.h @@ -0,0 +1,7 @@ +#ifndef _EPT_IDLE_NATIVE_PAGEWALK_H +#define _EPT_IDLE_NATIVE_PAGEWALK_H + +int ept_idle_walk_page_range(unsigned long start, unsigned long end, + struct mm_walk *walk); + +#endif diff --git a/tlb_flush.c b/tlb_flush.c new file mode 100644 index 0000000..974ec41 --- /dev/null +++ b/tlb_flush.c @@ -0,0 +1,288 @@ +#include "tlb_flush.h" + + +/* copied from 4.20 kernel: + * See Documentation/x86/tlb.txt for details. We choose 33 + * because it is large enough to cover the vast majority (at + * least 95%) of allocations, and is small enough that we are + * confident it will not cause too much overhead. Each single + * flush is about 100 ns, so this caps the maximum overhead at + * _about_ 3,000 ns. + * + * This is in units of pages. + */ +static unsigned long copied_tlb_single_page_flush_ceiling __read_mostly = 33; + + +static bool copied_tlb_is_not_lazy(int cpu, void *data) +{ + return !per_cpu(cpu_tlbstate.is_lazy, cpu); +} + + +/* + * flush_tlb_func_common()'s memory ordering requirement is that any + * TLB fills that happen after we flush the TLB are ordered after we + * read active_mm's tlb_gen. We don't need any explicit barriers + * because all x86 flush operations are serializing and the + * atomic64_read operation won't be reordered by the compiler. + */ +static void copied_flush_tlb_func_common(const struct flush_tlb_info *f, + bool local, enum tlb_flush_reason reason) +{ + /* + * We have three different tlb_gen values in here. They are: + * + * - mm_tlb_gen: the latest generation. + * - local_tlb_gen: the generation that this CPU has already caught + * up to. + * - f->new_tlb_gen: the generation that the requester of the flush + * wants us to catch up to. + */ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); + u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); + + /* This code cannot presently handle being reentered. */ + VM_WARN_ON(!irqs_disabled()); + + /* + * The init_mm is unexported variable, but we don't need + * check this here for our case, we just want to flush + * the TLB on remote CPU cores which is running the task + * using f->mm as memory space + */ +#if 0 + if (unlikely(loaded_mm == &init_mm)) + return; +#else + if (unlikely(loaded_mm != f->mm)) { + return; + } +#endif + + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + + /* + * The caller of this function will set is_lazy to false explicitly + * so we don't need handle this case, just skip this. + */ + #if 0 + if (this_cpu_read(cpu_tlbstate.is_lazy)) { + /* + * We're in lazy mode. We need to at least flush our + * paging-structure cache to avoid speculatively reading + * garbage into our TLB. Since switching to init_mm is barely + * slower than a minimal flush, just switch to init_mm. + * + * This should be rare, with native_flush_tlb_others skipping + * IPIs to lazy TLB mode CPUs. 
+ */ + switch_mm_irqs_off(NULL, &init_mm, NULL); + return; + } +#endif + + if (unlikely(local_tlb_gen == mm_tlb_gen)) { + /* + * There's nothing to do: we're already up to date. This can + * happen if two concurrent flushes happen -- the first flush to + * be handled can catch us all the way up, leaving no work for + * the second flush. + */ + // trace_tlb_flush(reason, 0); + return; + } + + WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); + WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); + + /* + * If we get to this point, we know that our TLB is out of date. + * This does not strictly imply that we need to flush (it's + * possible that f->new_tlb_gen <= local_tlb_gen), but we're + * going to need to flush in the very near future, so we might + * as well get it over with. + * + * The only question is whether to do a full or partial flush. + * + * We do a partial flush if requested and two extra conditions + * are met: + * + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that + * we've always done all needed flushes to catch up to + * local_tlb_gen. If, for example, local_tlb_gen == 2 and + * f->new_tlb_gen == 3, then we know that the flush needed to bring + * us up to date for tlb_gen 3 is the partial flush we're + * processing. + * + * As an example of why this check is needed, suppose that there + * are two concurrent flushes. The first is a full flush that + * changes context.tlb_gen from 1 to 2. The second is a partial + * flush that changes context.tlb_gen from 2 to 3. If they get + * processed on this CPU in reverse order, we'll see + * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. + * If we were to use __flush_tlb_one_user() and set local_tlb_gen to + * 3, we'd be break the invariant: we'd update local_tlb_gen above + * 1 without the full flush that's needed for tlb_gen 2. + * + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. + * Partial TLB flushes are not all that much cheaper than full TLB + * flushes, so it seems unlikely that it would be a performance win + * to do a partial flush if that won't bring our TLB fully up to + * date. By doing a full flush instead, we can increase + * local_tlb_gen all the way to mm_tlb_gen and we can probably + * avoid another flush in the very near future. + */ + if (f->end != TLB_FLUSH_ALL && + f->new_tlb_gen == local_tlb_gen + 1 && + f->new_tlb_gen == mm_tlb_gen) { + /* Partial flush */ + unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift; + unsigned long addr = f->start; + + while (addr < f->end) { + __flush_tlb_one_user(addr); + addr += 1UL << f->stride_shift; + } + if (local) + count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate); + // trace_tlb_flush(reason, nr_invalidate); + } else { + /* Full flush. */ + local_flush_tlb(); + if (local) + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + // trace_tlb_flush(reason, TLB_FLUSH_ALL); + } + + /* Both paths above update our state to mm_tlb_gen. 
*/ + this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); +} + +static void copied_flush_tlb_func_remote(void *info) +{ + const struct flush_tlb_info *f = info; + bool saved_lazy; + + inc_irq_stat(irq_tlb_count); + + if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm)) + return; + + saved_lazy = this_cpu_read(cpu_tlbstate.is_lazy); + this_cpu_write(cpu_tlbstate.is_lazy, false); + + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + copied_flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); + + this_cpu_write(cpu_tlbstate.is_lazy, saved_lazy); +} + + +static void copied_native_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); + +#if 0 + if (info->end == TLB_FLUSH_ALL) + trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); + else + trace_tlb_flush(TLB_REMOTE_SEND_IPI, + (info->end - info->start) >> PAGE_SHIFT); +#endif + /* + * Use non-UV system way in first version to reduce porting affort, + * we will support UV system later if necessary + */ +#if 0 + if (is_uv_system()) { + /* + * This whole special case is confused. UV has a "Broadcast + * Assist Unit", which seems to be a fancy way to send IPIs. + * Back when x86 used an explicit TLB flush IPI, UV was + * optimized to use its own mechanism. These days, x86 uses + * smp_call_function_many(), but UV still uses a manual IPI, + * and that IPI's action is out of date -- it does a manual + * flush instead of calling flush_tlb_func_remote(). This + * means that the percpu tlb_gen variables won't be updated + * and we'll do pointless flushes on future context switches. + * + * Rather than hooking native_flush_tlb_others() here, I think + * that UV should be updated so that smp_call_function_many(), + * etc, are optimal on UV. + */ + unsigned int cpu; + + cpu = smp_processor_id(); + cpumask = uv_flush_tlb_others(cpumask, info); + if (cpumask) + smp_call_function_many(cpumask, copied_flush_tlb_func_remote, + (void *)info, 1); + return; + } +#endif + + /* + * If no page tables were freed, we can skip sending IPIs to + * CPUs in lazy TLB mode. They will flush the CPU themselves + * at the next context switch. + * + * However, if page tables are getting freed, we need to send the + * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping + * up on the new contents of what used to be page tables, while + * doing a speculative memory access. + */ + if (info->freed_tables) + smp_call_function_many(cpumask, copied_flush_tlb_func_remote, + (void *)info, 1); + else + on_each_cpu_cond_mask(copied_tlb_is_not_lazy, copied_flush_tlb_func_remote, + (void *)info, 1, GFP_ATOMIC, cpumask); +} + + +void copied_flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned int stride_shift, + bool freed_tables) +{ + int cpu; + + struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { + .mm = mm, + .stride_shift = stride_shift, + .freed_tables = freed_tables, + }; + + cpu = get_cpu(); + + /* This is also a barrier that synchronizes with switch_mm(). */ + info.new_tlb_gen = inc_mm_tlb_gen(mm); + + /* Should we flush just the requested range? 
 */
+	if ((end != TLB_FLUSH_ALL) &&
+	    ((end - start) >> stride_shift) <= copied_tlb_single_page_flush_ceiling) {
+		info.start = start;
+		info.end = end;
+	} else {
+		info.start = 0UL;
+		info.end = TLB_FLUSH_ALL;
+	}
+
+	/* This should never happen in our case */
+	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
+		VM_WARN_ON(irqs_disabled());
+		local_irq_disable();
+		copied_flush_tlb_func_common(&info, true, TLB_LOCAL_MM_SHOOTDOWN);
+		local_irq_enable();
+	}
+
+	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
+		copied_native_flush_tlb_others(mm_cpumask(mm), &info);
+
+	put_cpu();
+}
+
diff --git a/tlb_flush.h b/tlb_flush.h
new file mode 100644
index 0000000..ca24adf
--- /dev/null
+++ b/tlb_flush.h
@@ -0,0 +1,10 @@
+#ifndef _TLB_FLUSH_H
+#define _TLB_FLUSH_H
+
+#include
+
+void copied_flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+			       unsigned long end, unsigned int stride_shift,
+			       bool freed_tables);
+
+#endif
-- 
GitLab
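
A usage sketch for reference (not part of the patch): the module exposes /proc/idle_pages, and a scan is driven entirely from userspace by binding the file descriptor to a target PID with the IDLE_PAGE_SET_PID ioctl, seeking to the virtual address where the scan should start, and reading back the PIP-encoded byte stream described in ept_idle.h. The short C program below is a minimal illustration under these assumptions: the kvm_ept_idle module from this patch is loaded, the constants mirror ept_idle.h (0xA0 is PIP_COMPOSE(PIP_CMD, 0), and the address payload is written most-significant byte first by u64_to_u8()), and the file name idle_pages_dump.c is hypothetical, not a tool shipped with the module.

/* idle_pages_dump.c - hypothetical userspace reader for /proc/idle_pages.
 * The ioctl number, PIP nibble encoding and page-type table below must
 * match ept_idle.h / ept_idle.c from this patch.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

#define IDLE_PAGE_SET_PID	_IOW(0x1, 0x1, pid_t)	/* must match ept_idle.h */
#define PIP_CMD_SET_HVA		0xA0			/* PIP_COMPOSE(PIP_CMD, 0) */

static const char *type_name[16] = {
	"PTE_ACCESSED", "PMD_ACCESSED", "PUD_PRESENT",
	"PTE_DIRTY", "PMD_DIRTY",
	"PTE_IDLE", "PMD_IDLE", "PMD_IDLE_PTES",
	"PTE_HOLE", "PMD_HOLE",
};

static const unsigned long type_size[16] = {	/* pagetype_size[] in ept_idle.c */
	[0] = 4096, [1] = 2UL << 20, [2] = 1UL << 30,	/* accessed: 4K/2M/1G */
	[3] = 4096, [4] = 2UL << 20,			/* dirty: 4K/2M */
	[5] = 4096, [6] = 2UL << 20, [7] = 2UL << 20,	/* idle: 4K/2M/2M of PTEs */
	[8] = 4096, [9] = 2UL << 20,			/* holes */
};

int main(int argc, char **argv)
{
	uint8_t buf[4096];
	unsigned long va = 0;
	ssize_t n;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}

	/* O_NONBLOCK/O_NOFOLLOW/O_NOATIME select the SCAN_* flags in ept_idle.h. */
	fd = open("/proc/idle_pages", O_RDONLY);
	if (fd < 0) {
		perror("open /proc/idle_pages");
		return 1;
	}

	/* Bind this fd to the target task; the module takes the pid by value. */
	if (ioctl(fd, IDLE_PAGE_SET_PID, (pid_t)atoi(argv[1])) < 0) {
		perror("IDLE_PAGE_SET_PID");
		return 1;
	}

	lseek(fd, 0, SEEK_SET);	/* f_pos is the virtual address to start scanning at */

	while ((n = read(fd, buf, sizeof(buf))) > 0) {
		ssize_t i = 0;

		while (i < n) {
			uint8_t b = buf[i++];

			if (b == PIP_CMD_SET_HVA) {
				/* The next 8 bytes carry the new cursor address, MSB first. */
				va = 0;
				for (int k = 0; k < 8 && i < n; k++)
					va = (va << 8) | buf[i++];
				continue;
			}
			unsigned int type = b >> 4, nr = b & 0xf;

			printf("%#lx: %u x %s\n", va, nr,
			       type_name[type] ? type_name[type] : "UNKNOWN");
			va += (unsigned long)nr * type_size[type];
		}
	}

	close(fd);
	return 0;
}

Whether the scan walks the EPT or the host page tables is decided inside the module: get_mm_and_kvm_by_pid() looks for an "anon_inode:kvm-vm" fd in the target task, so the same userspace flow covers both a QEMU guest and an ordinary process. Opening the proc file with O_NONBLOCK, O_NOFOLLOW or O_NOATIME enables the SCAN_HUGE_PAGE, SCAN_SKIM_IDLE or SCAN_DIRTY_PAGE behaviour defined in ept_idle.h.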