Unverified commit 9b7136fe, authored by openeuler-ci-bot, committed by Gitee

!59 add exec hugetlb support

Merge Pull Request from: @zhengzengkai 
 
Some applications have large code and data segments, which cause high
TLB miss rates when the page size is 4K. Mapping these apps into
hugetlbfs will boost their performance. The userspace tool libhugetlbfs
can do this, but it appears to have stopped development since 2020, and
it does not support debugging.
Tmpfs supports transparent hugepages, but they are split into small
pages during copy-on-write. Userspace hotpatching runs mprotect on the
code segment and triggers a pmd split, and some users find the
performance degradation after hotpatching unacceptable.
This patch set implements hugetlb support in exec and solves the
debugging issue.
 
Link: https://gitee.com/openeuler/kernel/pulls/59 
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com> 
Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com> 
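
A note on how a binary opts in, since this series only shows the kernel side: the loader changes below act on PT_LOAD entries whose p_flags carry the new PF_HUGETLB bit (0x1000000) and whose p_align equals 2MB, and only when the fs.exec-use-hugetlb sysctl is enabled. As an illustration only, a post-link tool along these lines could set the bit; the tool itself, and linking with -Wl,-z,max-page-size=0x200000 so that p_align passes the kernel's check, are assumptions, not part of this patch set:

/* mark-hugetlb.c: hypothetical helper that sets PF_HUGETLB on the
 * PT_LOAD headers of a 64-bit ELF file. Not part of this series. */
#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#define PF_HUGETLB 0x1000000	/* matches the uapi value added below */

int main(int argc, char **argv)
{
	struct stat st;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <elf-file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDWR);
	if (fd < 0 || fstat(fd, &st) < 0) {
		perror(argv[1]);
		return 1;
	}
	void *map = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	Elf64_Ehdr *eh = map;
	Elf64_Phdr *ph = (Elf64_Phdr *)((char *)map + eh->e_phoff);

	for (int i = 0; i < eh->e_phnum; i++) {
		/* elf_hugetlb_map() rejects segments whose p_align is
		 * not 2MB, so only flag segments that already qualify */
		if (ph[i].p_type == PT_LOAD && ph[i].p_align == 0x200000)
			ph[i].p_flags |= PF_HUGETLB;
	}

	munmap(map, st.st_size);
	close(fd);
	return 0;
}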
......@@ -6285,6 +6285,8 @@ CONFIG_HUGETLBFS=y
CONFIG_HUGETLB_PAGE=y
CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y
# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set
CONFIG_ENHANCED_HUGETLB_MMAP=y
CONFIG_EXEC_HUGETLB=y
CONFIG_MEMFD_CREATE=y
CONFIG_ARCH_HAS_GIGANTIC_PAGE=y
CONFIG_CONFIGFS_FS=y
......
......@@ -7373,6 +7373,8 @@ CONFIG_HUGETLB_PAGE=y
CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y
# CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is not set
CONFIG_DYNAMIC_HUGETLB=y
CONFIG_ENHANCED_HUGETLB_MMAP=y
CONFIG_EXEC_HUGETLB=y
CONFIG_MEMFD_CREATE=y
CONFIG_ARCH_HAS_GIGANTIC_PAGE=y
CONFIG_CONFIGFS_FS=y
......
......@@ -270,6 +270,30 @@ config DYNAMIC_HUGETLB
	  pages automatically. The tasks in the memcg prefer to alloc dynamic
	  hugepage.

config ENHANCED_HUGETLB_MMAP
	bool "enhanced hugetlb mmap"
	default n
	depends on HUGETLBFS
	help
	  Add private file mmap support for hugetlb.
	  This feature adds vm_actual_file to the vma to record the original
	  file, and copies the file contents to hugetlb pages during page
	  fault. Procfs and perf record will show the file name of
	  vm_actual_file.
	  Hugetlb is useful for reducing the TLB miss rate, and this feature
	  aims to extend its usage.

config EXEC_HUGETLB
	bool "use hugetlb in exec"
	default n
	depends on ENHANCED_HUGETLB_MMAP
	help
	  Some applications suffer from high TLB miss rates, and users don't
	  like transparent hugepage (a background thread affects overall
	  performance, and madvise after exec is too late). This feature
	  provides another way for apps to use huge pages: mapping them with
	  hugetlb in exec.
	  Only the ELF format is supported now.

config MEMFD_CREATE
	def_bool TMPFS || HUGETLBFS
......
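
For the MAP_FILE_HUGETLB path that ENHANCED_HUGETLB_MMAP enables (the flag is defined as 0x2000000 later in this diff), userspace maps a regular file privately and the kernel backs the range with hugetlb pages, filling them from the file at fault time. A minimal sketch of such a call, assuming a kernel built with this series; the file path is a placeholder, and the fallback macro definitions cover headers that don't yet export the new flag:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MAP_HUGE_SHIFT
#define MAP_HUGE_SHIFT	26
#endif
#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB	(21 << MAP_HUGE_SHIFT)	/* log2(2MB) = 21 */
#endif
#ifndef MAP_FILE_HUGETLB
#define MAP_FILE_HUGETLB 0x2000000	/* value from this series' uapi change */
#endif

int main(void)
{
	size_t len = 4 * 1024 * 1024;	/* two 2MB huge pages */
	int fd = open("/usr/lib64/libfoo.so", O_RDONLY);	/* placeholder path */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Must be MAP_PRIVATE: ksys_mmap_pgoff() below rejects shared
	 * MAP_FILE_HUGETLB mappings with -EINVAL. */
	void *p = mmap(NULL, len, PROT_READ,
		       MAP_PRIVATE | MAP_FILE_HUGETLB | MAP_HUGE_2MB, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* The first touch faults in a hugetlb page that hugetlb_no_page()
	 * fills from vm_actual_file via read_actual_file(). */
	printf("first byte: 0x%02x\n", *(unsigned char *)p);

	munmap(p, len);
	close(fd);
	return 0;
}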
......@@ -357,6 +357,107 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
	return 0;
}

#ifdef CONFIG_EXEC_HUGETLB
#define ELF_HPAGESIZE 0x200000
#define ELF_HPAGESTART(_v) ((_v) & ~(unsigned long)(ELF_HPAGESIZE - 1))
#define ELF_HPAGEOFFSET(_v) ((_v) & (ELF_HPAGESIZE - 1))
#define ELF_HPAGEALIGN(_v) (((_v) + ELF_HPAGESIZE - 1) & ~(ELF_HPAGESIZE - 1))
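/*
 * Example: with ELF_HPAGESIZE = 2MB, for _v = 0x60301000:
 * ELF_HPAGESTART(_v)  == 0x60200000 (round down to a 2MB boundary),
 * ELF_HPAGEOFFSET(_v) == 0x00101000 (offset within the 2MB page),
 * ELF_HPAGEALIGN(_v)  == 0x60400000 (round up to a 2MB boundary).
 */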
static int elf_hugetlb_bss(unsigned long bss, unsigned long brk, int prot,
		int type)
{
	unsigned long zero_byte = ELF_HPAGEOFFSET(bss);
	struct user_struct *user = NULL;
	struct file *huge_file;
	int page_size_log = (MAP_HUGE_2MB >> MAP_HUGE_SHIFT)
				& MAP_HUGE_MASK;

	if (zero_byte) {
		zero_byte = ELF_HPAGESIZE - zero_byte;
		if (clear_user((void __user *) bss, zero_byte))
			return -EFAULT;
	}

	bss = ELF_HPAGEALIGN(bss);
	brk = ELF_HPAGEALIGN(brk);
	if (brk > bss) {
		unsigned long size = brk - bss;

		huge_file = hugetlb_file_setup(HUGETLB_ANON_FILE, size,
				VM_NORESERVE, &user, HUGETLB_ANONHUGE_INODE,
				page_size_log);
		if (IS_ERR(huge_file))
			return -ENOMEM;

		bss = vm_mmap(huge_file, bss, size, prot, type, 0);
		if (BAD_ADDR(bss))
			return -ENOMEM;
	}

	return 0;
}
static unsigned long elf_hugetlb_map(struct file *filep, unsigned long addr,
		const struct elf_phdr *eppnt, int prot, int type,
		unsigned long total_size)
{
	unsigned long map_addr;
	unsigned long elf_offset = ELF_PAGEOFFSET(eppnt->p_vaddr);
	unsigned long size = eppnt->p_filesz + elf_offset;
	unsigned long off = eppnt->p_offset - elf_offset;
	int huge_flag = MAP_FILE_HUGETLB | MAP_HUGE_2MB;

	if (eppnt->p_align != ELF_HPAGESIZE)
		return -EINVAL;

	if (total_size) {
		total_size = ELF_HPAGEALIGN(total_size);
		addr = vm_mmap(filep, addr, total_size,
				PROT_NONE, type | huge_flag, 0);
		if (BAD_ADDR(addr))
			return -ENOMEM;
		vm_munmap(addr, total_size);
	}

	addr = ELF_PAGESTART(addr);
	map_addr = addr;
	type |= MAP_FIXED_NOREPLACE;

	/*
	 * The address of the relro segment is not huge-page aligned.
	 * Glibc will change the protection of this segment, so use a
	 * normal mmap for it to avoid an mprotect alignment error.
	 */
	if (addr != ELF_HPAGESTART(addr)) {
		unsigned long size_4k = ELF_HPAGEALIGN(addr) - addr;

		addr = vm_mmap(filep, addr, size_4k, prot, type, off);
		if (BAD_ADDR(addr))
			return -ENOMEM;
		size = ELF_PAGEALIGN(size) - size_4k;
		size = ELF_HPAGEALIGN(size);
		addr += size_4k;
		off += size_4k;
	} else {
		size = ELF_HPAGEALIGN(size);
	}

	addr = vm_mmap(filep, addr, size, prot, type | huge_flag, off);
	if (BAD_ADDR(addr))
		return -ENOMEM;

	if (eppnt->p_memsz > eppnt->p_filesz) {
		addr = map_addr + elf_offset;
		addr = elf_hugetlb_bss(addr + eppnt->p_filesz,
				addr + eppnt->p_memsz, prot, type);
		if (BAD_ADDR(addr))
			return -ENOMEM;
	}

	return map_addr;
}
#endif
static unsigned long elf_map(struct file *filep, unsigned long addr,
		const struct elf_phdr *eppnt, int prot, int type,
		unsigned long total_size)
......@@ -372,6 +473,12 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
	if (!size)
		return addr;

#ifdef CONFIG_EXEC_HUGETLB
	if (exec_hugetlb && (eppnt->p_flags & PF_HUGETLB))
		return elf_hugetlb_map(filep, addr, eppnt, prot, type,
				total_size);
#endif

	/*
	 * total_size is the size of the ELF (interpreter) image.
	 * The _first_ mmap needs to know the full size, otherwise
......@@ -1196,6 +1303,14 @@ static int load_elf_binary(struct linux_binprm *bprm)
			bss_prot = elf_prot;
			elf_brk = k;
		}

#ifdef CONFIG_EXEC_HUGETLB
		/*
		 * The bss is allocated in elf_hugetlb_bss(), so skip
		 * vm_brk_flags() in set_brk().
		 */
		if (exec_hugetlb && (elf_ppnt->p_flags & PF_HUGETLB))
			elf_bss = elf_brk = ELF_HPAGEALIGN(elf_brk);
#endif
	}

	e_entry = elf_ex->e_entry + load_bias;
......
......@@ -74,6 +74,10 @@
#include <trace/events/sched.h>

#ifdef CONFIG_EXEC_HUGETLB
int exec_hugetlb;
#endif

static int bprm_creds_from_file(struct linux_binprm *bprm);

int suid_dumpable = 0;
......
......@@ -280,6 +280,11 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
	dev_t dev = 0;
	const char *name = NULL;

#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
	if (vma->vm_actual_file)
		file = vma->vm_actual_file;
#endif

	if (file) {
		struct inode *inode = file_inode(vma->vm_file);

		dev = inode->i_sb->s_dev;
......
......@@ -99,4 +99,8 @@ static inline int arch_elf_adjust_prot(int prot,
}
#endif
#ifdef CONFIG_EXEC_HUGETLB
extern int exec_hugetlb;
#endif
#endif /* _LINUX_ELF_H */
......@@ -376,7 +376,11 @@ struct vm_area_struct {
#endif
	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
#if defined(CONFIG_ENHANCED_HUGETLB_MMAP) && !defined(__GENKSYMS__)
	KABI_USE(1, struct file *vm_actual_file);
#else
	KABI_RESERVE(1)
#endif
	KABI_RESERVE(2)
	KABI_RESERVE(3)
	KABI_RESERVE(4)
......
......@@ -31,6 +31,7 @@
#define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
#define MAP_REPLACE 0x1000000
#define MAP_FILE_HUGETLB 0x2000000 /* hugetlb private file map support */
#define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be
* uninitialized */
......
......@@ -241,6 +241,7 @@ typedef struct elf64_hdr {
#define PF_R 0x4
#define PF_W 0x2
#define PF_X 0x1
#define PF_HUGETLB 0x1000000
typedef struct elf32_phdr{
Elf32_Word p_type;
......
......@@ -8120,6 +8120,13 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
		flags |= MAP_LOCKED;
	if (is_vm_hugetlb_page(vma))
		flags |= MAP_HUGETLB;

#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
	if (vma->vm_actual_file) {
		/* perf will ignore hugetlb vma, so remove this flag */
		flags &= ~MAP_HUGETLB;
		file = vma->vm_actual_file;
	}
#endif

	if (file) {
		struct inode *inode;
......
......@@ -571,6 +571,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
		i_mmap_unlock_write(mapping);
	}

#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
	if (tmp->vm_actual_file)
		get_file(tmp->vm_actual_file);
#endif

	/*
	 * Clear hugetlb-related page reserves for children. This only
	 * affects MAP_PRIVATE mappings. Faults generated by the child
......
......@@ -3543,6 +3543,17 @@ static struct ctl_table fs_table[] = {
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ONE,
	},
#ifdef CONFIG_EXEC_HUGETLB
	{
		.procname	= "exec-use-hugetlb",
		.data		= &exec_hugetlb,
		.maxlen		= sizeof(exec_hugetlb),
		.mode		= 0600,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
#endif
	{ }
};
......
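
exec_hugetlb defaults to 0 (see the fs/exec.c hunk above), so the feature is opt-in; with mode 0600, only root can flip it. A minimal sketch of enabling it from C, equivalent to sysctl -w fs.exec-use-hugetlb=1:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* path follows the "exec-use-hugetlb" procname registered in
	 * fs_table above; the file is root-writable only (mode 0600) */
	int fd = open("/proc/sys/fs/exec-use-hugetlb", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1", 1) != 1) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}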
......@@ -4536,6 +4536,20 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
	i_mmap_unlock_write(mapping);
}

#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
static int read_actual_file(struct page *page, struct vm_area_struct *vma,
		loff_t *off, size_t size)
{
	void *kaddr;
	unsigned long read_size = 0;

	kaddr = kmap(page);
	read_size = kernel_read(vma->vm_actual_file, kaddr, size, off);
	kunmap(page);

	return IS_ERR_VALUE(read_size) ? read_size : 0;
}
#endif

/*
 * Hugetlb_cow() should be called with page lock of the original hugepage held.
 * Called with hugetlb_instantiation_mutex held and pte_page locked so we
......@@ -4837,6 +4851,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
		goto out;
	}
	clear_huge_page(page, address, pages_per_huge_page(h));

#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
	if (vma->vm_actual_file) {
		loff_t off = haddr - vma->vm_start
				+ (vma->vm_pgoff << PAGE_SHIFT);
		size_t page_size = huge_page_size(h);

		ret = read_actual_file(page, vma, &off, page_size);
		if (ret)
			goto out;
	}
#endif
	__SetPageUptodate(page);
	new_page = true;
......
......@@ -188,6 +188,10 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
		vma->vm_ops->close(vma);
	if (vma->vm_file)
		fput(vma->vm_file);
#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
	if (vma->vm_actual_file)
		fput(vma->vm_actual_file);
#endif
	mpol_put(vma_policy(vma));
	sp_area_drop(vma);
	vm_area_free(vma);
......@@ -1849,6 +1853,17 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
		return -EBADF;
	if (is_file_hugepages(file)) {
		len = ALIGN(len, huge_page_size(hstate_file(file)));
#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
	/*
	 * glibc can use this flag to load libraries,
	 * a feature similar to exec_hugetlb.
	 */
	} else if (unlikely(flags & MAP_FILE_HUGETLB)) {
		if (!(flags & MAP_PRIVATE)) {
			retval = -EINVAL;
			goto out_fput;
		}
#endif
	} else if (unlikely(flags & MAP_HUGETLB)) {
		retval = -EINVAL;
		goto out_fput;
......@@ -3047,6 +3062,11 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
	if (new->vm_file)
		get_file(new->vm_file);
#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
	if (new->vm_actual_file)
		get_file(new->vm_actual_file);
#endif

	if (new->vm_ops && new->vm_ops->open)
		new->vm_ops->open(new);
......
......@@ -496,6 +496,31 @@ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
}
EXPORT_SYMBOL_GPL(account_locked_vm);

#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
static struct file *prepare_hugetlb_mmap(unsigned long flags, unsigned long size)
{
	int page_size_log = (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK;
	struct user_struct *user = NULL;

	return hugetlb_file_setup(HUGETLB_ANON_FILE, size, VM_NORESERVE, &user,
			HUGETLB_ANONHUGE_INODE, page_size_log);
}

static unsigned long finish_hugetlb_mmap(unsigned long addr, struct file *actual_file,
		struct file *huge_file)
{
	struct vm_area_struct *vma;

	fput(huge_file);
	vma = find_vma(current->mm, addr);
	if (!vma)
		return -EINVAL;
	vma->vm_actual_file = get_file(actual_file);

	return addr;
}
#endif

unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
		unsigned long len, unsigned long prot,
		unsigned long flag, unsigned long pgoff)
......@@ -504,13 +529,28 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	struct mm_struct *mm = current->mm;
	unsigned long populate;
	LIST_HEAD(uf);
#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
	struct file *actual_file = NULL;
#endif

	ret = security_mmap_file(file, prot, flag);
#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
	if (flag & MAP_FILE_HUGETLB) {
		actual_file = file;
		file = prepare_hugetlb_mmap(flag, len + (pgoff << PAGE_SHIFT));
		if (IS_ERR(file))
			return PTR_ERR(file);
	}
#endif
	if (!ret) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
		ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
			      &uf);
#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
		if (!IS_ERR_VALUE(ret) && (flag & MAP_FILE_HUGETLB))
			ret = finish_hugetlb_mmap(ret, actual_file, file);
#endif
		mmap_write_unlock(mm);
		userfaultfd_unmap_complete(mm, &uf);
		if (populate)
......