diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index a4dbaa42e49d55b510bec800e7c3f33199a23126..682fbaf9e592734cf43e2eaf11c9472d7a9b9eb8 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -1036,6 +1036,7 @@ CONFIG_FRAME_VECTOR=y
 # CONFIG_GUP_BENCHMARK is not set
 # CONFIG_READ_ONLY_THP_FOR_FS is not set
 CONFIG_ARCH_HAS_PTE_SPECIAL=y
+CONFIG_PIN_MEMORY=y
 # end of Memory Management options
 
 CONFIG_NET=y
@@ -3282,6 +3283,7 @@ CONFIG_TCG_TIS_ST33ZP24_SPI=y
 # CONFIG_RANDOM_TRUST_CPU is not set
 # CONFIG_RANDOM_TRUST_BOOTLOADER is not set
+CONFIG_PIN_MEMORY_DEV=m
 
 #
 # I2C support
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index c1f1fb91e4ce6fceb913b5b05b13ab541432b0f4..5e282d31ade44a6f3736a8369cbb12e9649ea879 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -50,6 +50,9 @@
 #include
 #include
 #include
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/pin_mem.h>
+#endif
 
 static int num_standard_resources;
 static struct resource *standard_resources;
@@ -259,6 +262,12 @@ static void __init request_standard_resources(void)
 		    quick_kexec_res.start >= res->start &&
 		    quick_kexec_res.end <= res->end)
 			request_resource(res, &quick_kexec_res);
+#endif
+#ifdef CONFIG_PIN_MEMORY
+		if (pin_memory_resource.end &&
+		    pin_memory_resource.start >= res->start &&
+		    pin_memory_resource.end <= res->end)
+			request_resource(res, &pin_memory_resource);
 #endif
 	}
 }
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index f3e5a66c91a76bb295c7569ecb02aecb6e24405f..b3437440d9b9c9f78fd885d05c00f6f663ba0d93 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -42,6 +42,9 @@
 #include
 #include
 #include
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/pin_mem.h>
+#endif
 
 #define ARM64_ZONE_DMA_BITS	30
@@ -78,6 +81,55 @@ static void __init reserve_crashkernel(void)
  */
 #define MAX_USABLE_RANGES	2
 
+#ifdef CONFIG_PIN_MEMORY
+struct resource pin_memory_resource = {
+	.name = "Pin memory",
+	.start = 0,
+	.end = 0,
+	.flags = IORESOURCE_MEM,
+	.desc = IORES_DESC_RESERVED
+};
+
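+/*
+ * Reserve the contiguous region requested via "pinmemory=" on the kernel
+ * command line. The base address must be 2MB aligned and the whole range
+ * must lie in memblock memory that is not already reserved.
+ */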
+static void __init reserve_pin_memory_res(void)
+{
+	unsigned long long mem_start, mem_len;
+	int ret;
+
+	ret = parse_pin_memory(boot_command_line, memblock_phys_mem_size(),
+			       &mem_len, &mem_start);
+	if (ret || !mem_len)
+		return;
+
+	mem_len = PAGE_ALIGN(mem_len);
+
+	if (!memblock_is_region_memory(mem_start, mem_len)) {
+		pr_warn("cannot reserve for pin memory: region is not memory!\n");
+		return;
+	}
+
+	if (memblock_is_region_reserved(mem_start, mem_len)) {
+		pr_warn("cannot reserve for pin memory: region overlaps reserved memory!\n");
+		return;
+	}
+
+	if (!IS_ALIGNED(mem_start, SZ_2M)) {
+		pr_warn("cannot reserve for pin memory: base address is not 2MB aligned\n");
+		return;
+	}
+
+	memblock_reserve(mem_start, mem_len);
+	pr_debug("pin memory resource reserved: 0x%016llx - 0x%016llx (%llu MB)\n",
+		 mem_start, mem_start + mem_len, mem_len >> 20);
+
+	pin_memory_resource.start = mem_start;
+	pin_memory_resource.end = mem_start + mem_len - 1;
+}
+#else
+static void __init reserve_pin_memory_res(void)
+{
+}
+#endif /* CONFIG_PIN_MEMORY */
+
 #ifdef CONFIG_CRASH_DUMP
 static int __init early_init_dt_scan_elfcorehdr(unsigned long node,
 		const char *uname, int depth, void *data)
@@ -455,6 +507,8 @@ void __init arm64_memblock_init(void)
 	reserve_park_mem();
 #endif
 
+	reserve_pin_memory_res();
+
 	reserve_elfcorehdr();
 
 	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
@@ -583,6 +637,12 @@ void __init mem_init(void)
 	/* this will put all unused low memory onto the freelists */
 	memblock_free_all();
 
+#ifdef CONFIG_PIN_MEMORY
+	/* pre-allocate the pages for pin memory */
+	init_reserve_page_map((unsigned long)pin_memory_resource.start,
+		(unsigned long)(pin_memory_resource.end - pin_memory_resource.start + 1));
+#endif
+
 	mem_init_print_info(NULL);
 
 	/*
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index d229a2d0c017493a858636ed947ba1f953dfad78..02dca67cb2c5bd42d9b6cbf79a50be58c6412f49 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -496,3 +496,10 @@ config RANDOM_TRUST_BOOTLOADER
 	booloader is trustworthy so it will be added to the kernel's entropy
 	pool. Otherwise, say N here so it will be regarded as device input that
 	only mixes the entropy pool.
+
+config PIN_MEMORY_DEV
+	tristate "/dev/pinmem character device"
+	depends on PIN_MEMORY
+	default m
+	help
+	  Provide the /dev/pinmem misc device. Checkpoint/restore tools use
+	  its ioctl interface to pin task memory and remap it on restore.
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index ffce287ef41551d3819dbc250ca7d74b415f326d..71d76fd62692fe64859ca464984b0bdd8a9454f9 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -47,3 +47,4 @@ obj-$(CONFIG_PS3_FLASH)		+= ps3flash.o
 obj-$(CONFIG_XILLYBUS)		+= xillybus/
 obj-$(CONFIG_POWERNV_OP_PANEL) += powernv-op-panel.o
 obj-$(CONFIG_ADI)		+= adi.o
+obj-$(CONFIG_PIN_MEMORY_DEV)	+= pin_memory.o
diff --git a/drivers/char/pin_memory.c b/drivers/char/pin_memory.c
new file mode 100644
index 0000000000000000000000000000000000000000..3e3ce4dd273178222b37a463ddb76666d2b7e465
--- /dev/null
+++ b/drivers/char/pin_memory.c
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
+ * Pin memory driver for checkpoint and restore.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/ioctl.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/pid.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/pin_mem.h>
+
+#define MAX_PIN_MEM_AREA_NUM  16
+struct _pin_mem_area {
+	unsigned long virt_start;
+	unsigned long virt_end;
+};
+
+struct pin_mem_area_set {
+	unsigned int pid;
+	unsigned int area_num;
+	struct _pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM];
+};
+
+#define PIN_MEM_MAGIC 0x59
+#define _SET_PIN_MEM_AREA	1
+#define _CLEAR_PIN_MEM_AREA	2
+#define _REMAP_PIN_MEM_AREA	3
+#define _FINISH_PIN_MEM_DUMP	4
+#define _PIN_MEM_IOC_MAX_NR	4
+#define SET_PIN_MEM_AREA	_IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set)
+#define CLEAR_PIN_MEM_AREA	_IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int)
+#define REMAP_PIN_MEM_AREA	_IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int)
+#define FINISH_PIN_MEM_DUMP	_IOW(PIN_MEM_MAGIC, _FINISH_PIN_MEM_DUMP, int)
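+
+/*
+ * ioctl protocol: SET_PIN_MEM_AREA records up to MAX_PIN_MEM_AREA_NUM
+ * virtual ranges of the target pid at checkpoint time,
+ * REMAP_PIN_MEM_AREA maps the saved physical pages back into the
+ * restored task, CLEAR_PIN_MEM_AREA drops all records, and
+ * FINISH_PIN_MEM_DUMP seals the dump area with its SHA-256 digest.
+ */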
+static int set_pin_mem(struct pin_mem_area_set *pmas)
+{
+	int i;
+	int ret = 0;
+	struct _pin_mem_area *pma;
+	struct mm_struct *mm;
+	struct task_struct *task;
+	struct pid *pid_s;
+
+	pid_s = find_get_pid(pmas->pid);
+	if (!pid_s) {
+		pr_warn("Get pid struct fail:%d.\n", pmas->pid);
+		return -EFAULT;
+	}
+	rcu_read_lock();
+	task = pid_task(pid_s, PIDTYPE_PID);
+	if (!task) {
+		rcu_read_unlock();
+		pr_warn("Get task struct fail:%d.\n", pmas->pid);
+		goto fail;
+	}
+	get_task_struct(task);
+	/* get_task_mm() and pin_mem_area() may sleep; do not hold RCU across them. */
+	rcu_read_unlock();
+	mm = get_task_mm(task);
+	if (!mm) {
+		pr_warn("Get mm struct fail:%d.\n", pmas->pid);
+		goto put_task;
+	}
+	for (i = 0; i < pmas->area_num; i++) {
+		pma = &(pmas->mem_area[i]);
+		ret = pin_mem_area(task, mm, pma->virt_start, pma->virt_end);
+		if (ret) {
+			mmput(mm);
+			goto put_task;
+		}
+	}
+	mmput(mm);
+	put_task_struct(task);
+	put_pid(pid_s);
+	return ret;
+
+put_task:
+	put_task_struct(task);
+fail:
+	put_pid(pid_s);
+	return -EFAULT;
+}
+
+static int set_pin_mem_area(unsigned long arg)
+{
+	struct pin_mem_area_set pmas;
+	void __user *buf = (void __user *)arg;
+
+	if (!access_ok(buf, sizeof(pmas)))
+		return -EFAULT;
+	if (copy_from_user(&pmas, buf, sizeof(pmas)))
+		return -EFAULT;
+	if (pmas.area_num > MAX_PIN_MEM_AREA_NUM) {
+		pr_warn("Input area_num is too large.\n");
+		return -EINVAL;
+	}
+
+	return set_pin_mem(&pmas);
+}
+
+static int pin_mem_remap(unsigned long arg)
+{
+	int pid;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	vm_fault_t ret;
+	void __user *buf = (void __user *)arg;
+	struct pid *pid_s;
+
+	if (!access_ok(buf, sizeof(int)))
+		return -EFAULT;
+	if (copy_from_user(&pid, buf, sizeof(int)))
+		return -EFAULT;
+
+	pid_s = find_get_pid(pid);
+	if (!pid_s) {
+		pr_warn("Get pid struct fail:%d.\n", pid);
+		return -EINVAL;
+	}
+	rcu_read_lock();
+	task = pid_task(pid_s, PIDTYPE_PID);
+	if (!task) {
+		rcu_read_unlock();
+		pr_warn("Get task struct fail:%d.\n", pid);
+		goto fault;
+	}
+	get_task_struct(task);
+	/* do_mem_remap() takes mmap_lock and may sleep; RCU is not held here. */
+	rcu_read_unlock();
+	mm = get_task_mm(task);
+	if (!mm) {
+		pr_warn("Get mm struct fail:%d.\n", pid);
+		goto put_task;
+	}
+	ret = do_mem_remap(pid, mm);
+	if (ret) {
+		pr_warn("Handle pin memory remap fail.\n");
+		mmput(mm);
+		goto put_task;
+	}
+	mmput(mm);
+	put_task_struct(task);
+	put_pid(pid_s);
+	return 0;
+
+put_task:
+	put_task_struct(task);
+fault:
+	put_pid(pid_s);
+	return -EFAULT;
+}
+
+static long pin_memory_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	long ret = 0;
+
+	if (_IOC_TYPE(cmd) != PIN_MEM_MAGIC)
+		return -EINVAL;
+	if (_IOC_NR(cmd) > _PIN_MEM_IOC_MAX_NR)
+		return -EINVAL;
+
+	switch (cmd) {
+	case SET_PIN_MEM_AREA:
+		ret = set_pin_mem_area(arg);
+		break;
+	case CLEAR_PIN_MEM_AREA:
+		clear_pin_memory_record();
+		break;
+	case REMAP_PIN_MEM_AREA:
+		ret = pin_mem_remap(arg);
+		break;
+	case FINISH_PIN_MEM_DUMP:
+		ret = finish_pin_mem_dump();
+		break;
+	default:
+		return -EINVAL;
+	}
+	return ret;
+}
+
+static const struct file_operations pin_memory_fops = {
+	.owner = THIS_MODULE,
+	.unlocked_ioctl = pin_memory_ioctl,
+	.compat_ioctl = pin_memory_ioctl,
+};
+
+static struct miscdevice pin_memory_miscdev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "pinmem",
+	.fops = &pin_memory_fops,
+};
+
+static int pin_memory_init(void)
+{
+	int err = misc_register(&pin_memory_miscdev);
+
+	if (!err)
+		pr_info("pin_memory init\n");
+	else
+		pr_warn("pin_memory init failed!\n");
+	return err;
+}
+
+static void pin_memory_exit(void)
+{
+	misc_deregister(&pin_memory_miscdev);
+	pr_info("pin_memory exit\n");
+}
+
+module_init(pin_memory_init);
+module_exit(pin_memory_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Euler");
+MODULE_DESCRIPTION("pin memory");
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index fc0ef33a76f738fc1a37d01d95e5cc44794aa327..30f0df3cfbfb48c80eae34852a5a9a215e4fe979 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -87,4 +87,9 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
 int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base);
 
+#ifdef CONFIG_PIN_MEMORY
+int __init parse_pin_memory(char *cmdline, unsigned long long system_ram,
+		unsigned long long *pin_size, unsigned long long *pin_base);
+#endif
+
 #endif /* LINUX_CRASH_CORE_H */
diff --git a/include/linux/pin_mem.h b/include/linux/pin_mem.h
new file mode 100644
index 0000000000000000000000000000000000000000..af1d4e5ceca2e1bd7a3fca90a083bf558ab37bb8
--- /dev/null
+++ b/include/linux/pin_mem.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
+ * Provide the pin memory method for checkpoint and restore tasks.
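+ *
+ * The reserved region (see init_page_map_info()) is laid out as a
+ * pin_mem_dump_info header, max_pin_pid_num page_map_info slots, the
+ * page_map_entry records with their physical address arrays, and the
+ * redirect space at the end.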
+ */
+#ifndef _LINUX_PIN_MEMORY_H
+#define _LINUX_PIN_MEMORY_H
+
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#ifdef CONFIG_ARM64
+#include <linux/ioport.h>
+#endif
+
+#define PAGE_BUDDY_MAPCOUNT_VALUE	(~PG_buddy)
+
+#define COLLECT_PAGES_FINISH		0
+#define COLLECT_PAGES_NEED_CONTINUE	1
+#define COLLECT_PAGES_FAIL		-1
+
+#define COMPOUND_PAD_MASK	0xffffffff
+#define COMPOUND_PAD_START	0x88
+#define COMPOUND_PAD_DELTA	0x40
+#define LIST_POISON4		0xdead000000000400
+#define PAGE_FLAGS_CHECK_RESERVED	(1UL << PG_reserved)
+#define SHA256_DIGEST_SIZE	32
+#define next_pme(pme)	((unsigned long *)(pme + 1) + pme->nr_pages)
+#define PIN_MEM_DUMP_MAGIC	0xfeab000000001acd
+struct page_map_entry {
+	unsigned long virt_addr;
+	unsigned int nr_pages;
+	unsigned int is_huge_page;
+	unsigned long redirect_start;
+	unsigned long phy_addr_array[0];
+};
+
+struct page_map_info {
+	int pid;
+	int pid_reserved;
+	unsigned int entry_num;
+	int disable_free_page;
+	struct page_map_entry *pme;
+};
+
+struct pin_mem_dump_info {
+	char sha_digest[SHA256_DIGEST_SIZE];
+	unsigned long magic;
+	unsigned int pin_pid_num;
+	struct page_map_info pmi_array[0];
+};
+
+struct redirect_info {
+	unsigned int redirect_pages;
+	unsigned int redirect_index[0];
+};
+
+extern struct page_map_info *get_page_map_info(int pid);
+extern struct page_map_info *create_page_map_info(int pid);
+extern vm_fault_t do_mem_remap(int pid, struct mm_struct *mm);
+extern vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, struct page *page);
+extern void clear_pin_memory_record(void);
+extern int pin_mem_area(struct task_struct *task, struct mm_struct *mm,
+		unsigned long start_addr, unsigned long end_addr);
+extern vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, struct page *page);
+extern int finish_pin_mem_dump(void);
+
+/* Reserved space for pin memory. */
+#ifdef CONFIG_ARM64
+extern struct resource pin_memory_resource;
+#endif
+extern void init_reserve_page_map(unsigned long map_addr, unsigned long map_size);
+
+#endif /* CONFIG_PIN_MEMORY */
+#endif /* _LINUX_PIN_MEMORY_H */
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index bfed474b8da6c9ea2a2b952f627edad4949c7f7a..2407de35cc1a80016ee8f913c50818f23ea3b56b 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -450,6 +450,17 @@ void __init reserve_crashkernel(void)
 }
 #endif /* CONFIG_ARCH_WANT_RESERVE_CRASH_KERNEL */
 
+#ifdef CONFIG_PIN_MEMORY
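+/*
+ * Syntax is the same as "crashkernel=": pinmemory=size[KMG][@offset[KMG]],
+ * e.g. pinmemory=100M@2G on the kernel command line.
+ */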
+int __init parse_pin_memory(char *cmdline,
+		unsigned long long system_ram,
+		unsigned long long *pin_size,
+		unsigned long long *pin_base)
+{
+	return __parse_crashkernel(cmdline, system_ram, pin_size, pin_base,
+				"pinmemory=", NULL);
+}
+#endif
+
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len)
 {
diff --git a/mm/Kconfig b/mm/Kconfig
index 390165ffbb0fc2395a7460836eea971e0c0176f3..930dc1390951f8171152c2c197422122a0d165ae 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -859,4 +859,12 @@ config ARCH_HAS_HUGEPD
 config MAPPING_DIRTY_HELPERS
 	bool
 
+config PIN_MEMORY
+	bool "Support for pin memory"
+	depends on CHECKPOINT_RESTORE
+	help
+	  Say Y here to enable the pin memory feature for checkpoint and
+	  restore: at checkpoint time the memory of a task is pinned and the
+	  mapping from virtual addresses to physical pages is recorded, and
+	  at restore time the same physical pages are remapped into the
+	  restored task.
+
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index d73aed0fc99c1d408090c8175f482bbd24a6f2a9..49638279c3f6bc6778cb28f11d98c79f3fca10b4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -120,3 +120,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
 obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
 obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
 obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
+obj-$(CONFIG_PIN_MEMORY) += pin_mem.o
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0bc4a2cae1e5609b09de4bd5390e1c0745fc87c3..8a11d30f674c1c0b98a96cc0797d5c30dc96815a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2996,3 +2996,64 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
 	update_mmu_cache_pmd(vma, address, pvmw->pmd);
 }
 #endif
+
+#ifdef CONFIG_PIN_MEMORY
+/* Map the caller-supplied huge page at @address; mirrors __do_huge_pmd_anonymous_page(). */
+vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, struct page *page)
+{
+	gfp_t gfp;
+	pgtable_t pgtable;
+	spinlock_t *ptl;
+	pmd_t entry;
+	vm_fault_t ret = 0;
+
+	if (unlikely(anon_vma_prepare(vma)))
+		return VM_FAULT_OOM;
+	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
+		return VM_FAULT_OOM;
+	gfp = alloc_hugepage_direct_gfpmask(vma);
+	prep_transhuge_page(page);
+	if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
+		put_page(page);
+		count_vm_event(THP_FAULT_FALLBACK);
+		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
+		return VM_FAULT_FALLBACK;
+	}
+	cgroup_throttle_swaprate(page, gfp);
+
+	pgtable = pte_alloc_one(vma->vm_mm);
+	if (unlikely(!pgtable)) {
+		ret = VM_FAULT_OOM;
+		goto release;
+	}
+	__SetPageUptodate(page);
+	ptl = pmd_lock(vma->vm_mm, pmd);
+	if (unlikely(!pmd_none(*pmd))) {
+		goto unlock_release;
+	} else {
+		ret = check_stable_address_space(vma->vm_mm);
+		if (ret)
+			goto unlock_release;
+		entry = mk_huge_pmd(page, vma->vm_page_prot);
+		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+		page_add_new_anon_rmap(page, vma, address, true);
+		lru_cache_add_inactive_or_unevictable(page, vma);
+		pgtable_trans_huge_deposit(vma->vm_mm, pmd, pgtable);
+		set_pmd_at(vma->vm_mm, address, pmd, entry);
+		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+		mm_inc_nr_ptes(vma->vm_mm);
+		spin_unlock(ptl);
+		count_vm_event(THP_FAULT_ALLOC);
+		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+	}
+
+	return 0;
+unlock_release:
+	spin_unlock(ptl);
+release:
+	if (pgtable)
+		pte_free(vma->vm_mm, pgtable);
+	put_page(page);
+	return ret;
+}
+#endif
diff --git a/mm/memory.c b/mm/memory.c
index 50632c4366b8ab5d66dfd162847bf7ecfc9c6d97..7b7f1a7813bc2813cc42463d55b54e4cdfab6d07 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5248,3 +5248,62 @@ void ptlock_free(struct page *page)
 	kmem_cache_free(page_ptl_cachep, page->ptl);
 }
 #endif
+
+#ifdef CONFIG_PIN_MEMORY
+vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, struct page *page)
+{
+	pte_t entry;
+	spinlock_t *ptl;
+	pte_t *pte;
+	vm_fault_t ret = 0;
+
+	if (pte_alloc(vma->vm_mm, pmd))
+		return VM_FAULT_OOM;
+
+	/* See the comment in pte_alloc_one_map() */
+	if (unlikely(pmd_trans_unstable(pmd)))
+		return 0;
+
+	/* Allocate our own private page.
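+	 * The page itself is supplied by the caller; the flow below mirrors
+	 * do_anonymous_page().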
+	 */
+	if (unlikely(anon_vma_prepare(vma)))
+		goto oom;
+
+	if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+		goto oom_free_page;
+	cgroup_throttle_swaprate(page, GFP_KERNEL);
+
+	__SetPageUptodate(page);
+
+	entry = mk_pte(page, vma->vm_page_prot);
+	if (vma->vm_flags & VM_WRITE)
+		entry = pte_mkwrite(pte_mkdirty(entry));
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+	if (!pte_none(*pte)) {
+		ret = VM_FAULT_FALLBACK;
+		goto release;
+	}
+
+	ret = check_stable_address_space(vma->vm_mm);
+	if (ret)
+		goto release;
+	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+	page_add_new_anon_rmap(page, vma, address, false);
+	lru_cache_add_inactive_or_unevictable(page, vma);
+
+	set_pte_at(vma->vm_mm, address, pte, entry);
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(vma, address, pte);
+unlock:
+	pte_unmap_unlock(pte, ptl);
+	return ret;
+release:
+	put_page(page);
+	goto unlock;
+oom_free_page:
+	put_page(page);
+oom:
+	return VM_FAULT_OOM;
+}
+#endif
diff --git a/mm/pin_mem.c b/mm/pin_mem.c
new file mode 100644
index 0000000000000000000000000000000000000000..13b14686ba83c201f721ca4358b198bebd79444b
--- /dev/null
+++ b/mm/pin_mem.c
@@ -0,0 +1,954 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
+ * Provide the pin memory method for checkpoint and restore task.
+ */
+#ifdef CONFIG_PIN_MEMORY
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/huge_mm.h>
+#include <linux/pagemap.h>
+#include <linux/io.h>
+#include <crypto/sha.h>
+#include <linux/pin_mem.h>
+
+#define MAX_PIN_PID_NUM  128
+static DEFINE_SPINLOCK(page_map_entry_lock);
+
+struct pin_mem_dump_info *pin_mem_dump_start;
+unsigned int pin_pid_num;
+static unsigned int *pin_pid_num_addr;
+static struct page_map_entry *__page_map_entry_start;
+static unsigned long page_map_entry_end;
+static struct page_map_info *user_space_reserve_start;
+static struct page_map_entry *page_map_entry_start;
+unsigned int max_pin_pid_num __read_mostly;
+unsigned long redirect_space_size;
+unsigned long redirect_space_start;
+#define DEFAULT_REDIRECT_SPACE_SIZE	0x100000
+
+static int __init setup_max_pin_pid_num(char *str)
+{
+	int ret = 0;
+
+	if (!str)
+		goto out;
+
+	ret = kstrtouint(str, 10, &max_pin_pid_num);
+out:
+	if (ret) {
+		pr_warn("Unable to parse max pin pid num.\n");
+	} else {
+		if (max_pin_pid_num > MAX_PIN_PID_NUM) {
+			max_pin_pid_num = 0;
+			pr_warn("Input max_pin_pid_num is too large.\n");
+		}
+	}
+	return ret;
+}
+early_param("max_pin_pid_num", setup_max_pin_pid_num);
+
+static int __init setup_redirect_space_size(char *str)
+{
+	if (!str)
+		goto out;
+
+	redirect_space_size = memparse(str, NULL);
+out:
+	if (!redirect_space_size) {
+		pr_warn("Unable to parse redirect space size, use the default value.\n");
+		redirect_space_size = DEFAULT_REDIRECT_SPACE_SIZE;
+	}
+	return 0;
+}
+early_param("redirect_space_size", setup_redirect_space_size);
+
+struct page_map_info *create_page_map_info(int pid)
+{
+	struct page_map_info *new;
+
+	if (!user_space_reserve_start)
+		return NULL;
+
+	if (pin_pid_num >= max_pin_pid_num) {
+		pr_warn("pin_pid_num reached max_pin_pid_num, fail to create info for pid: %d!\n", pid);
+		return NULL;
+	}
+	new = (struct page_map_info *)(user_space_reserve_start + pin_pid_num);
+	new->pid = pid;
+	new->pme = NULL;
+	new->entry_num = 0;
+	new->pid_reserved = false;
+	new->disable_free_page = false;
+	(*pin_pid_num_addr)++;
+	pin_pid_num++;
+	return new;
+}
+EXPORT_SYMBOL_GPL(create_page_map_info);
+
+struct page_map_info *get_page_map_info(int pid)
+{
+	int i;
+
+	if (!user_space_reserve_start)
+		return NULL;
+
+	for (i = 0; i < pin_pid_num; i++) {
+		if (user_space_reserve_start[i].pid == pid)
+			return &(user_space_reserve_start[i]);
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(get_page_map_info);
+
+static struct page *find_head_page(struct page *page)
+{
+	struct page *p = page;
+
+	while (!PageBuddy(p)) {
+		if (PageLRU(p))
+			return NULL;
+		p--;
+	}
+	return p;
+}
+
+static void split_page_area_left(struct zone *zone, struct free_area *area, struct page *page,
+		unsigned long size, int order)
+{
+	unsigned long cur_size = 1 << order;
+	unsigned long total_size = 0;
+
+	while (size && cur_size > size) {
+		cur_size >>= 1;
+		order--;
+		area--;
+		if (cur_size <= size) {
+			list_add(&page[total_size].lru, &area->free_list[MIGRATE_MOVABLE]);
+			atomic_set(&(page[total_size]._mapcount), PAGE_BUDDY_MAPCOUNT_VALUE);
+			set_page_private(&page[total_size], order);
+			set_pageblock_migratetype(&page[total_size], MIGRATE_MOVABLE);
+			area->nr_free++;
+			total_size += cur_size;
+			size -= cur_size;
+		}
+	}
+}
+
+static void split_page_area_right(struct zone *zone, struct free_area *area, struct page *page,
+		unsigned long size, int order)
+{
+	unsigned long cur_size = 1 << order;
+	struct page *right_page, *head_page;
+
+	right_page = page + size;
+	while (size && cur_size > size) {
+		cur_size >>= 1;
+		order--;
+		area--;
+		if (cur_size <= size) {
+			head_page = right_page - cur_size;
+			list_add(&head_page->lru, &area->free_list[MIGRATE_MOVABLE]);
+			atomic_set(&(head_page->_mapcount), PAGE_BUDDY_MAPCOUNT_VALUE);
+			set_page_private(head_page, order);
+			set_pageblock_migratetype(head_page, MIGRATE_MOVABLE);
+			area->nr_free++;
+			size -= cur_size;
+			right_page = head_page;
+		}
+	}
+}
+
+void reserve_page_from_buddy(unsigned long nr_pages, struct page *page)
+{
+	unsigned int current_order;
+	struct page *page_end;
+	struct free_area *area;
+	struct zone *zone;
+	struct page *head_page;
+
+	head_page = find_head_page(page);
+	if (!head_page) {
+		pr_warn("Find page head fail.\n");
+		return;
+	}
+	current_order = head_page->private;
+	page_end = head_page + (1 << current_order);
+	zone = page_zone(head_page);
+	area = &(zone->free_area[current_order]);
+	list_del(&head_page->lru);
+	atomic_set(&head_page->_mapcount, -1);
+	set_page_private(head_page, 0);
+	area->nr_free--;
+	if (head_page != page)
+		split_page_area_left(zone, area, head_page,
+				(unsigned long)(page - head_page), current_order);
+	page = page + nr_pages;
+	if (page < page_end) {
+		split_page_area_right(zone, area, page,
+				(unsigned long)(page_end - page), current_order);
+	} else if (page > page_end) {
+		pr_warn("Reserved pages run past the end of the buddy block.\n");
+	}
+}
+
+static inline void reserve_user_normal_pages(struct page *page)
+{
+	atomic_inc(&page->_refcount);
+	reserve_page_from_buddy(1, page);
+}
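+
+/*
+ * Rebuild the compound-page metadata of a PMD-sized huge page by hand:
+ * head flags on the first page, compound_head pointers (tagged with
+ * bit 0), dtor and order on the first tail page, and deferred_list on
+ * the second tail page, as the page allocator would have set them up.
+ */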
+static void init_huge_pmd_pages(struct page *head_page)
+{
+	int i = 0;
+	struct page *page = head_page;
+
+	__set_bit(PG_head, &page->flags);
+	__set_bit(PG_active, &page->flags);
+	atomic_set(&page->_refcount, 1);
+	page++;
+	i++;
+	page->compound_head = (unsigned long)head_page + 1;
+	page->compound_dtor = HUGETLB_PAGE_DTOR + 1;
+	page->compound_order = HPAGE_PMD_ORDER;
+	page++;
+	i++;
+	page->compound_head = (unsigned long)head_page + 1;
+	i++;
+	INIT_LIST_HEAD(&(page->deferred_list));
+	for (; i < HPAGE_PMD_NR; i++) {
+		page = head_page + i;
+		page->compound_head = (unsigned long)head_page + 1;
+	}
+}
+
+static inline void reserve_user_huge_pmd_pages(struct page *page)
+{
+	atomic_inc(&page->_refcount);
+	reserve_page_from_buddy((1 << HPAGE_PMD_ORDER), page);
+	init_huge_pmd_pages(page);
+}
+
+int reserve_user_map_pages_fail;
+
+void free_user_map_pages(unsigned int pid_index, unsigned int entry_index, unsigned int page_index)
+{
+	unsigned int i, j, index, order;
+	struct page_map_info *pmi;
+	struct page_map_entry *pme;
+	struct page *page;
+	unsigned long phy_addr;
+
+	for (index = 0; index < pid_index; index++) {
+		pmi = &(user_space_reserve_start[index]);
+		pme = pmi->pme;
+		for (i = 0; i < pmi->entry_num; i++) {
+			for (j = 0; j < pme->nr_pages; j++) {
+				order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
+				phy_addr = pme->phy_addr_array[j];
+				if (phy_addr) {
+					page = phys_to_page(phy_addr);
+					if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+						__free_pages(page, order);
+						pme->phy_addr_array[j] = 0;
+					}
+				}
+			}
+			pme = (struct page_map_entry *)next_pme(pme);
+		}
+	}
+	pmi = &(user_space_reserve_start[index]);
+	pme = pmi->pme;
+	for (i = 0; i < entry_index; i++) {
+		for (j = 0; j < pme->nr_pages; j++) {
+			order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
+			phy_addr = pme->phy_addr_array[j];
+			if (phy_addr) {
+				page = phys_to_page(phy_addr);
+				if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+					__free_pages(page, order);
+					pme->phy_addr_array[j] = 0;
+				}
+			}
+		}
+		pme = (struct page_map_entry *)next_pme(pme);
+	}
+	for (j = 0; j < page_index; j++) {
+		order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
+		phy_addr = pme->phy_addr_array[j];
+		if (phy_addr) {
+			page = phys_to_page(phy_addr);
+			if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+				__free_pages(page, order);
+				pme->phy_addr_array[j] = 0;
+			}
+		}
+	}
+}
+
+bool check_redirect_end_valid(struct redirect_info *redirect_start,
+		unsigned long max_redirect_page_num)
+{
+	unsigned long redirect_end;
+
+	redirect_end = ((unsigned long)(redirect_start + 1) +
+		max_redirect_page_num * sizeof(unsigned int));
+	if (redirect_end > redirect_space_start + redirect_space_size)
+		return false;
+	return true;
+}
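+
+/*
+ * Reserve the recorded pages out of the buddy allocator during boot.
+ * Pages whose refcount is already nonzero cannot be reserved; those in
+ * the reserved region are noted in the redirect space so that the remap
+ * path can copy their contents into freshly allocated pages instead.
+ */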
+static void reserve_user_space_map_pages(void)
+{
+	struct page_map_info *pmi;
+	struct page_map_entry *pme;
+	unsigned int i, j, index;
+	struct page *page;
+	unsigned long flags;
+	unsigned long phy_addr;
+	unsigned long redirect_pages = 0;
+	struct redirect_info *redirect_start = (struct redirect_info *)redirect_space_start;
+
+	if (!user_space_reserve_start || !redirect_start)
+		return;
+	spin_lock_irqsave(&page_map_entry_lock, flags);
+	for (index = 0; index < pin_pid_num; index++) {
+		pmi = &(user_space_reserve_start[index]);
+		pme = pmi->pme;
+		for (i = 0; i < pmi->entry_num; i++) {
+			redirect_pages = 0;
+			if (!check_redirect_end_valid(redirect_start, pme->nr_pages))
+				redirect_start = NULL;
+			for (j = 0; j < pme->nr_pages; j++) {
+				phy_addr = pme->phy_addr_array[j];
+				if (!phy_addr)
+					continue;
+				page = phys_to_page(phy_addr);
+				if (atomic_read(&page->_refcount)) {
+					if ((page->flags & PAGE_FLAGS_CHECK_RESERVED)
+						&& !pme->redirect_start)
+						pme->redirect_start =
+							(unsigned long)redirect_start;
+					if (redirect_start &&
+						(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+						redirect_start->redirect_index[redirect_pages] = j;
+						redirect_pages++;
+						continue;
+					} else {
+						reserve_user_map_pages_fail = 1;
+						pr_warn("Page %pK refcount %d is nonzero, cannot reserve.\n",
+							page, atomic_read(&page->_refcount));
+						goto free_pages;
+					}
+				}
+				if (!pme->is_huge_page)
+					reserve_user_normal_pages(page);
+				else
+					reserve_user_huge_pmd_pages(page);
+			}
+			pme = (struct page_map_entry *)next_pme(pme);
+			if (redirect_pages && redirect_start) {
+				redirect_start->redirect_pages = redirect_pages;
+				redirect_start = (struct redirect_info *)(
+					(unsigned long)(redirect_start + 1) +
+					redirect_start->redirect_pages * sizeof(unsigned int));
+			}
+		}
+	}
+	spin_unlock_irqrestore(&page_map_entry_lock, flags);
+	return;
+free_pages:
+	free_user_map_pages(index, i, j);
+	spin_unlock_irqrestore(&page_map_entry_lock, flags);
+}
+
+int calculate_pin_mem_digest(struct pin_mem_dump_info *pmdi, char *digest)
+{
+	int i;
+	struct sha256_state sctx;
+
+	if (!digest)
+		digest = pmdi->sha_digest;
+	sha256_init(&sctx);
+	sha256_update(&sctx, (unsigned char *)(&(pmdi->magic)),
+		sizeof(struct pin_mem_dump_info) - SHA256_DIGEST_SIZE);
+	for (i = 0; i < pmdi->pin_pid_num; i++) {
+		sha256_update(&sctx, (unsigned char *)(&(pmdi->pmi_array[i])),
+			sizeof(struct page_map_info));
+	}
+	sha256_final(&sctx, digest);
+	return 0;
+}
+
+static int check_sha_digest(struct pin_mem_dump_info *pmdi)
+{
+	int ret = 0;
+	char digest[SHA256_DIGEST_SIZE] = {0};
+
+	ret = calculate_pin_mem_digest(pmdi, digest);
+	if (ret) {
+		pr_warn("calculate pin mem digest fail:%d\n", ret);
+		return ret;
+	}
+	if (memcmp(pmdi->sha_digest, digest, SHA256_DIGEST_SIZE)) {
+		pr_warn("pin mem dump info sha256 digest mismatch!\n");
+		return -EFAULT;
+	}
+	return ret;
+}
+
+/*
+ * The whole page map entry collect process must run sequentially:
+ * user_space_reserve_start points to the first page map info of the
+ * first dumped task, and page_map_entry_start points to the first
+ * page map entry of the first dumped vma.
+ */
+static void init_page_map_info(struct pin_mem_dump_info *pmdi, unsigned long map_len)
+{
+	if (pin_mem_dump_start || !max_pin_pid_num) {
+		pr_warn("pin page map already init or max_pin_pid_num not set.\n");
+		return;
+	}
+	if (map_len < sizeof(struct pin_mem_dump_info) +
+		max_pin_pid_num * sizeof(struct page_map_info) + redirect_space_size) {
+		pr_warn("pin memory reserved memblock too small.\n");
+		return;
+	}
+	if ((pmdi->magic != PIN_MEM_DUMP_MAGIC) || (pmdi->pin_pid_num > max_pin_pid_num) ||
+		check_sha_digest(pmdi))
+		memset(pmdi, 0, sizeof(struct pin_mem_dump_info));
+	pin_mem_dump_start = pmdi;
+	pin_pid_num = pmdi->pin_pid_num;
+	pr_info("pin_pid_num: %u\n", pin_pid_num);
+	pin_pid_num_addr = &(pmdi->pin_pid_num);
+	user_space_reserve_start =
+		(struct page_map_info *)pmdi->pmi_array;
+	page_map_entry_start =
+		(struct page_map_entry *)(user_space_reserve_start + max_pin_pid_num);
+	__page_map_entry_start = page_map_entry_start;
+	page_map_entry_end = (unsigned long)pmdi + map_len - redirect_space_size;
+	redirect_space_start = page_map_entry_end;
+	if (pin_pid_num > 0)
+		reserve_user_space_map_pages();
+}
+
+int finish_pin_mem_dump(void)
+{
+	int ret;
+
+	if (!pin_mem_dump_start)
+		return -EFAULT;
+	pin_mem_dump_start->magic = PIN_MEM_DUMP_MAGIC;
+	memset(pin_mem_dump_start->sha_digest, 0, SHA256_DIGEST_SIZE);
+	ret = calculate_pin_mem_digest(pin_mem_dump_start, NULL);
+	if (ret) {
+		pr_warn("calculate pin mem digest fail:%d\n", ret);
+		return ret;
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(finish_pin_mem_dump);
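+
+/*
+ * Both collectors return COLLECT_PAGES_FINISH when the whole range was
+ * recorded, COLLECT_PAGES_NEED_CONTINUE when the page size changes
+ * mid-range (the caller restarts with the other collector), and
+ * COLLECT_PAGES_FAIL on error.
+ */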
+int collect_pmd_huge_pages(struct task_struct *task,
+		unsigned long start_addr, unsigned long end_addr, struct page_map_entry *pme)
+{
+	long res;
+	int index = 0;
+	unsigned long start = start_addr;
+	struct page *temp_page;
+
+	while (start < end_addr) {
+		temp_page = NULL;
+		res = get_user_pages_remote(task->mm, start, 1,
+			FOLL_TOUCH | FOLL_GET, &temp_page, NULL, NULL);
+		if (res != 1) {
+			pr_warn("Get huge page for addr(%lx) fail.\n", start);
+			return COLLECT_PAGES_FAIL;
+		}
+		if (PageHead(temp_page)) {
+			start += HPAGE_PMD_SIZE;
+			pme->phy_addr_array[index] = page_to_phys(temp_page);
+			index++;
+		} else {
+			pme->nr_pages = index;
+			atomic_dec(&((temp_page)->_refcount));
+			return COLLECT_PAGES_NEED_CONTINUE;
+		}
+	}
+	pme->nr_pages = index;
+	return COLLECT_PAGES_FINISH;
+}
+
+int collect_normal_pages(struct task_struct *task,
+		unsigned long start_addr, unsigned long end_addr, struct page_map_entry *pme)
+{
+	int res;
+	unsigned long next;
+	unsigned long i, nr_pages;
+	struct page *tmp_page;
+	unsigned long *phy_addr_array = pme->phy_addr_array;
+	struct page **page_array = (struct page **)pme->phy_addr_array;
+
+	next = (start_addr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
+	next = (next > end_addr) ? end_addr : next;
+	pme->nr_pages = 0;
+	while (start_addr < next) {
+		nr_pages = (PAGE_ALIGN(next) - start_addr) / PAGE_SIZE;
+		res = get_user_pages_remote(task->mm, start_addr, 1,
+			FOLL_TOUCH | FOLL_GET, &tmp_page, NULL, NULL);
+		if (res != 1) {
+			pr_warn("Get user page of %lx fail.\n", start_addr);
+			return COLLECT_PAGES_FAIL;
+		}
+		if (PageHead(tmp_page)) {
+			atomic_dec(&(tmp_page->_refcount));
+			return COLLECT_PAGES_NEED_CONTINUE;
+		}
+		atomic_dec(&(tmp_page->_refcount));
+		if (PageTail(tmp_page)) {
+			start_addr = next;
+			pme->virt_addr = start_addr;
+			next = (next + HPAGE_PMD_SIZE) > end_addr ?
+				end_addr : (next + HPAGE_PMD_SIZE);
+			continue;
+		}
+		res = get_user_pages_remote(task->mm, start_addr, nr_pages,
+			FOLL_TOUCH | FOLL_GET, page_array, NULL, NULL);
+		if (res != nr_pages) {
+			pr_warn("Get user pages of %lx fail.\n", start_addr);
+			return COLLECT_PAGES_FAIL;
+		}
+		for (i = 0; i < nr_pages; i++)
+			phy_addr_array[i] = page_to_phys(page_array[i]);
+		pme->nr_pages += nr_pages;
+		page_array += nr_pages;
+		phy_addr_array += nr_pages;
+		start_addr = next;
+		next = (next + HPAGE_PMD_SIZE) > end_addr ? end_addr : (next + HPAGE_PMD_SIZE);
+	}
+	return COLLECT_PAGES_FINISH;
+}
+
+/* Users make sure that the pin memory belongs to an anonymous vma.
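+ * If a range turns out to mix normal and huge pages, the collector stops
+ * at the boundary and pin_mem_area() calls itself again for the remainder
+ * with the other page size.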
+ */
+int pin_mem_area(struct task_struct *task, struct mm_struct *mm,
+		unsigned long start_addr, unsigned long end_addr)
+{
+	int pid, ret;
+	int is_huge_page = false;
+	unsigned int page_size;
+	unsigned long nr_pages, flags;
+	struct page_map_entry *pme;
+	struct page_map_info *pmi;
+	struct vm_area_struct *vma;
+	unsigned long i;
+	struct page *tmp_page;
+
+	if (!page_map_entry_start
+		|| !task || !mm
+		|| start_addr >= end_addr)
+		return -EFAULT;
+
+	pid = task->pid;
+	spin_lock_irqsave(&page_map_entry_lock, flags);
+	nr_pages = ((end_addr - start_addr) / PAGE_SIZE);
+	if ((unsigned long)page_map_entry_start + nr_pages * sizeof(struct page *) >=
+		page_map_entry_end) {
+		pr_warn("Page map entry use up!\n");
+		ret = -EFAULT;
+		goto finish;
+	}
+	vma = find_extend_vma(mm, start_addr);
+	if (!vma) {
+		pr_warn("Find no match vma!\n");
+		ret = -EFAULT;
+		goto finish;
+	}
+	if (start_addr == (start_addr & HPAGE_PMD_MASK) &&
+		transparent_hugepage_enabled(vma)) {
+		page_size = HPAGE_PMD_SIZE;
+		is_huge_page = true;
+	} else {
+		page_size = PAGE_SIZE;
+	}
+	pme = page_map_entry_start;
+	pme->virt_addr = start_addr;
+	pme->redirect_start = 0;
+	pme->is_huge_page = is_huge_page;
+	memset(pme->phy_addr_array, 0, nr_pages * sizeof(unsigned long));
+	down_write(&mm->mmap_lock);
+	if (!is_huge_page) {
+		ret = collect_normal_pages(task, start_addr, end_addr, pme);
+		if (ret != COLLECT_PAGES_FAIL && !pme->nr_pages) {
+			if (ret == COLLECT_PAGES_FINISH) {
+				ret = 0;
+				up_write(&mm->mmap_lock);
+				goto finish;
+			}
+			pme->is_huge_page = true;
+			page_size = HPAGE_PMD_SIZE;
+			ret = collect_pmd_huge_pages(task, pme->virt_addr, end_addr, pme);
+		}
+	} else {
+		ret = collect_pmd_huge_pages(task, start_addr, end_addr, pme);
+		if (ret != COLLECT_PAGES_FAIL && !pme->nr_pages) {
+			if (ret == COLLECT_PAGES_FINISH) {
+				ret = 0;
+				up_write(&mm->mmap_lock);
+				goto finish;
+			}
+			pme->is_huge_page = false;
+			page_size = PAGE_SIZE;
+			ret = collect_normal_pages(task, pme->virt_addr, end_addr, pme);
+		}
+	}
+	up_write(&mm->mmap_lock);
+	if (ret == COLLECT_PAGES_FAIL) {
+		ret = -EFAULT;
+		goto finish;
+	}
+
+	/* check for zero pages */
+	for (i = 0; i < pme->nr_pages; i++) {
+		tmp_page = phys_to_page(pme->phy_addr_array[i]);
+		if (!pme->is_huge_page) {
+			if (page_to_pfn(tmp_page) == my_zero_pfn(pme->virt_addr + i * PAGE_SIZE))
+				pme->phy_addr_array[i] = 0;
+		} else if (is_huge_zero_page(tmp_page))
+			pme->phy_addr_array[i] = 0;
+	}
+
+	page_map_entry_start = (struct page_map_entry *)(next_pme(pme));
+	pmi = get_page_map_info(pid);
+	if (!pmi)
+		pmi = create_page_map_info(pid);
+	if (!pmi) {
+		pr_warn("Create page map info fail for pid: %d!\n", pid);
+		ret = -EFAULT;
+		goto finish;
+	}
+	if (!pmi->pme)
+		pmi->pme = pme;
+	pmi->entry_num++;
+	spin_unlock_irqrestore(&page_map_entry_lock, flags);
+	if (ret == COLLECT_PAGES_NEED_CONTINUE)
+		ret = pin_mem_area(task, mm, pme->virt_addr + pme->nr_pages * page_size, end_addr);
+	return ret;
+finish:
+	spin_unlock_irqrestore(&page_map_entry_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pin_mem_area);
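+
+/*
+ * Remap the recorded 4K pages into the restored task at their original
+ * virtual addresses; redirected pages are first copied into newly
+ * allocated ones.
+ */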
+vm_fault_t remap_normal_pages(struct mm_struct *mm, struct vm_area_struct *vma,
+		struct page_map_entry *pme)
+{
+	vm_fault_t ret;
+	unsigned int j, i;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pmd_t *pmd;
+	pud_t *pud;
+	struct page *page, *new;
+	unsigned long address;
+	unsigned long phy_addr;
+	unsigned int redirect_pages = 0;
+	struct redirect_info *redirect_start;
+
+	redirect_start = (struct redirect_info *)pme->redirect_start;
+	for (j = 0; j < pme->nr_pages; j++) {
+		address = pme->virt_addr + j * PAGE_SIZE;
+		phy_addr = pme->phy_addr_array[j];
+		if (!phy_addr)
+			continue;
+		page = phys_to_page(phy_addr);
+		if (page_to_pfn(page) == my_zero_pfn(address)) {
+			pme->phy_addr_array[j] = 0;
+			continue;
+		}
+		pme->phy_addr_array[j] = 0;
+		if (redirect_start && (redirect_pages < redirect_start->redirect_pages) &&
+			(j == redirect_start->redirect_index[redirect_pages])) {
+			new = alloc_zeroed_user_highpage_movable(vma, address);
+			if (!new) {
+				pr_warn("Redirect alloc page fail\n");
+				continue;
+			}
+			copy_page(page_to_virt(new), phys_to_virt(phy_addr));
+			page = new;
+			redirect_pages++;
+		}
+		page->mapping = NULL;
+		pgd = pgd_offset(mm, address);
+		p4d = p4d_alloc(mm, pgd, address);
+		if (!p4d) {
+			ret = VM_FAULT_OOM;
+			goto free;
+		}
+		pud = pud_alloc(mm, p4d, address);
+		if (!pud) {
+			ret = VM_FAULT_OOM;
+			goto free;
+		}
+		pmd = pmd_alloc(mm, pud, address);
+		if (!pmd) {
+			ret = VM_FAULT_OOM;
+			goto free;
+		}
+		ret = do_anon_page_remap(vma, address, pmd, page);
+		if (ret)
+			goto free;
+	}
+	return 0;
+free:
+	for (i = j; i < pme->nr_pages; i++) {
+		phy_addr = pme->phy_addr_array[i];
+		if (phy_addr) {
+			__free_page(phys_to_page(phy_addr));
+			pme->phy_addr_array[i] = 0;
+		}
+	}
+	return ret;
+}
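+
+/* Local mirror of alloc_hugepage_direct_gfpmask() for the THP remap path. */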
+static inline gfp_t get_hugepage_gfpmask(struct vm_area_struct *vma)
+{
+	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+
+	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
+		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
+		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
+			__GFP_KSWAPD_RECLAIM);
+	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
+			0);
+	return GFP_TRANSHUGE_LIGHT;
+}
+
+vm_fault_t remap_huge_pmd_pages(struct mm_struct *mm, struct vm_area_struct *vma,
+		struct page_map_entry *pme)
+{
+	vm_fault_t ret;
+	unsigned int j, i;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pmd_t *pmd;
+	pud_t *pud;
+	gfp_t gfp;
+	struct page *page, *new;
+	unsigned long address;
+	unsigned long phy_addr;
+	unsigned int redirect_pages = 0;
+	struct redirect_info *redirect_start;
+
+	redirect_start = (struct redirect_info *)pme->redirect_start;
+	for (j = 0; j < pme->nr_pages; j++) {
+		address = pme->virt_addr + j * HPAGE_PMD_SIZE;
+		phy_addr = pme->phy_addr_array[j];
+		if (!phy_addr)
+			continue;
+		page = phys_to_page(phy_addr);
+		if (is_huge_zero_page(page)) {
+			pme->phy_addr_array[j] = 0;
+			continue;
+		}
+		pme->phy_addr_array[j] = 0;
+		if (redirect_start && (redirect_pages < redirect_start->redirect_pages) &&
+			(j == redirect_start->redirect_index[redirect_pages])) {
+			gfp = get_hugepage_gfpmask(vma);
+			new = alloc_hugepage_vma(gfp, vma, address, HPAGE_PMD_ORDER);
+			if (!new) {
+				pr_warn("Redirect alloc huge page fail\n");
+				continue;
+			}
+			memcpy(page_to_virt(new), phys_to_virt(phy_addr), HPAGE_PMD_SIZE);
+			page = new;
+			redirect_pages++;
+		}
+		pgd = pgd_offset(mm, address);
+		p4d = p4d_alloc(mm, pgd, address);
+		if (!p4d) {
+			ret = VM_FAULT_OOM;
+			goto free;
+		}
+		pud = pud_alloc(mm, p4d, address);
+		if (!pud) {
+			ret = VM_FAULT_OOM;
+			goto free;
+		}
+		pmd = pmd_alloc(mm, pud, address);
+		if (!pmd) {
+			ret = VM_FAULT_OOM;
+			goto free;
+		}
+		ret = do_anon_huge_page_remap(vma, address, pmd, page);
+		if (ret)
+			goto free;
+	}
+	return 0;
+free:
+	for (i = j; i < pme->nr_pages; i++) {
+		phy_addr = pme->phy_addr_array[i];
+		if (phy_addr) {
+			page = phys_to_page(phy_addr);
+			if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+				__free_pages(page, HPAGE_PMD_ORDER);
+				pme->phy_addr_array[i] = 0;
+			}
+		}
+	}
+	return ret;
+}
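+
+/* Free the pages of the entries that were not remapped after a failure. */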
+static void free_unmap_pages(struct page_map_info *pmi,
+		struct page_map_entry *pme,
+		unsigned int index)
+{
+	unsigned int i, j;
+	unsigned long phy_addr;
+	unsigned int order;
+	struct page *page;
+
+	pme = (struct page_map_entry *)(next_pme(pme));
+	for (i = index; i < pmi->entry_num; i++) {
+		for (j = 0; j < pme->nr_pages; j++) {
+			phy_addr = pme->phy_addr_array[j];
+			if (phy_addr) {
+				page = phys_to_page(phy_addr);
+				order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
+				if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+					__free_pages(page, order);
+					pme->phy_addr_array[j] = 0;
+				}
+			}
+		}
+		pme = (struct page_map_entry *)(next_pme(pme));
+	}
+}
+
+vm_fault_t do_mem_remap(int pid, struct mm_struct *mm)
+{
+	unsigned int i = 0;
+	vm_fault_t ret = 0;
+	struct vm_area_struct *vma;
+	struct page_map_info *pmi;
+	struct page_map_entry *pme;
+	unsigned long flags;
+
+	if (reserve_user_map_pages_fail)
+		return -EFAULT;
+	pmi = get_page_map_info(pid);
+	if (!pmi)
+		return -EFAULT;
+
+	spin_lock_irqsave(&page_map_entry_lock, flags);
+	pmi->disable_free_page = true;
+	spin_unlock_irqrestore(&page_map_entry_lock, flags);
+	down_write(&mm->mmap_lock);
+	pme = pmi->pme;
+	vma = mm->mmap;
+	while ((i < pmi->entry_num) && (vma != NULL)) {
+		if (pme->virt_addr >= vma->vm_start && pme->virt_addr < vma->vm_end) {
+			i++;
+			if (!vma_is_anonymous(vma)) {
+				pme = (struct page_map_entry *)(next_pme(pme));
+				continue;
+			}
+			if (!pme->is_huge_page) {
+				ret = remap_normal_pages(mm, vma, pme);
+				if (ret)
+					goto free;
+			} else {
+				ret = remap_huge_pmd_pages(mm, vma, pme);
+				if (ret)
+					goto free;
+			}
+			pme = (struct page_map_entry *)(next_pme(pme));
+		} else {
+			vma = vma->vm_next;
+		}
+	}
+	up_write(&mm->mmap_lock);
+	return 0;
+free:
+	free_unmap_pages(pmi, pme, i);
+	up_write(&mm->mmap_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(do_mem_remap);
+
+#if defined(CONFIG_ARM64)
+void init_reserve_page_map(unsigned long map_addr, unsigned long map_size)
+{
+	void *addr;
+
+	if (!map_addr || !map_size)
+		return;
+	addr = phys_to_virt(map_addr);
+	init_page_map_info((struct pin_mem_dump_info *)addr, map_size);
+}
+#else
+void init_reserve_page_map(unsigned long map_addr, unsigned long map_size)
+{
+}
+#endif
+
+static void free_all_reserved_pages(void)
+{
+	unsigned int i, j, index, order;
+	struct page_map_info *pmi;
+	struct page_map_entry *pme;
+	struct page *page;
+	unsigned long phy_addr;
+
+	if (!user_space_reserve_start || reserve_user_map_pages_fail)
+		return;
+
+	for (index = 0; index < pin_pid_num; index++) {
+		pmi = &(user_space_reserve_start[index]);
+		if (pmi->disable_free_page)
+			continue;
+		pme = pmi->pme;
+		for (i = 0; i < pmi->entry_num; i++) {
+			for (j = 0; j < pme->nr_pages; j++) {
+				order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0;
+				phy_addr = pme->phy_addr_array[j];
+				if (phy_addr) {
+					page = phys_to_page(phy_addr);
+					if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) {
+						__free_pages(page, order);
+						pme->phy_addr_array[j] = 0;
+					}
+				}
+			}
+			pme = (struct page_map_entry *)next_pme(pme);
+		}
+	}
+}
+
+/* Clear all pin memory records. */
+void clear_pin_memory_record(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&page_map_entry_lock, flags);
+	free_all_reserved_pages();
+	if (pin_pid_num_addr) {
+		*pin_pid_num_addr = 0;
+		pin_pid_num = 0;
+		page_map_entry_start = __page_map_entry_start;
+	}
+	spin_unlock_irqrestore(&page_map_entry_lock, flags);
+}
+EXPORT_SYMBOL_GPL(clear_pin_memory_record);
+
+#endif /* CONFIG_PIN_MEMORY */
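
Usage note (not part of the patch): a userspace checkpoint tool would drive
/dev/pinmem roughly as sketched below. This is a minimal illustration only;
the ioctl values and struct layout must mirror drivers/char/pin_memory.c,
and the pid and address range here are placeholders.

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/ioctl.h>

	#define PIN_MEM_MAGIC 0x59
	/* Must mirror struct pin_mem_area_set in drivers/char/pin_memory.c. */
	struct _pin_mem_area { unsigned long virt_start; unsigned long virt_end; };
	struct pin_mem_area_set {
		unsigned int pid;
		unsigned int area_num;
		struct _pin_mem_area mem_area[16];
	};
	#define SET_PIN_MEM_AREA	_IOW(PIN_MEM_MAGIC, 1, struct pin_mem_area_set)
	#define REMAP_PIN_MEM_AREA	_IOW(PIN_MEM_MAGIC, 3, int)
	#define FINISH_PIN_MEM_DUMP	_IOW(PIN_MEM_MAGIC, 4, int)

	int main(void)
	{
		/* Placeholder pid and range of one anonymous vma of the task. */
		struct pin_mem_area_set pmas = {
			.pid = 1234,
			.area_num = 1,
			.mem_area = { { 0x400000UL, 0x600000UL } },
		};
		int pid = 1234;
		int fd = open("/dev/pinmem", O_RDWR);

		if (fd < 0)
			return 1;
		if (ioctl(fd, SET_PIN_MEM_AREA, &pmas))	/* checkpoint: pin the pages */
			perror("SET_PIN_MEM_AREA");
		if (ioctl(fd, FINISH_PIN_MEM_DUMP, &pid))	/* seal the dump area */
			perror("FINISH_PIN_MEM_DUMP");
		/* After reboot with pinmemory=..., the restore side would call: */
		if (ioctl(fd, REMAP_PIN_MEM_AREA, &pid))
			perror("REMAP_PIN_MEM_AREA");
		close(fd);
		return 0;
	}

Booting with, e.g., "pinmemory=200M@0x2000000000 max_pin_pid_num=16" reserves
the dump region and sizes the pid table; both parameters are parsed by this
patch (parse_pin_memory() and setup_max_pin_pid_num()).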