提交 4733c59f 编写于 作者: D Daniel Jordan 提交者: Xie XiuQi

hugetlbfs: parallelize hugetlbfs_fallocate with ktask

hulk inclusion
category: feature
bugzilla: 13228
CVE: NA
---------------------------

hugetlbfs_fallocate preallocates huge pages to back a file in a
hugetlbfs filesystem.  The time to call this function grows linearly
with size.

ktask performs well with its default thread count of 4; higher thread
counts are given for context only.

Machine: Intel(R) Xeon(R) CPU E7-8895 v3 @ 2.60GHz, 288 CPUs, 1T memory
Test:    fallocate(1) a file on a hugetlbfs filesystem

nthread   speedup   size (GiB)   min time (s)   stdev
      1                    200         127.53    2.19
      2     3.09x          200          41.30    2.11
      4     5.72x          200          22.29    0.51
      8     9.45x          200          13.50    2.58
     16     9.74x          200          13.09    1.64

      1                    400         193.09    2.47
      2     2.14x          400          90.31    3.39
      4     3.84x          400          50.32    0.44
      8     5.11x          400          37.75    1.23
     16     6.12x          400          31.54    3.13

The primary bottleneck for better scaling at higher thread counts is
hugetlb_fault_mutex_table[hash].  perf showed L1-dcache-loads increase
with 8 threads and again sharply with 16 threads, and a CPU counter
profile showed that 31% of the L1d misses were on
hugetlb_fault_mutex_table[hash] in the 16-thread case.
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Hongbo Yao <yaohongbo@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Tested-by: Hongbo Yao <yaohongbo@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
上级 ae0cd4d4
...@@ -36,6 +36,7 @@ ...@@ -36,6 +36,7 @@
#include <linux/magic.h> #include <linux/magic.h>
#include <linux/migrate.h> #include <linux/migrate.h>
#include <linux/uio.h> #include <linux/uio.h>
#include <linux/ktask.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
...@@ -76,11 +77,16 @@ static const match_table_t tokens = { ...@@ -76,11 +77,16 @@ static const match_table_t tokens = {
}; };
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
static inline struct shared_policy *hugetlb_get_shared_policy(
struct inode *inode)
{
return &HUGETLBFS_I(inode)->policy;
}
static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
struct inode *inode, pgoff_t index) struct shared_policy *policy, pgoff_t index)
{ {
vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy, vma->vm_policy = mpol_shared_policy_lookup(policy, index);
index);
} }
static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
...@@ -88,8 +94,14 @@ static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) ...@@ -88,8 +94,14 @@ static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
mpol_cond_put(vma->vm_policy); mpol_cond_put(vma->vm_policy);
} }
#else #else
static inline struct shared_policy *hugetlb_get_shared_policy(
struct inode *inode)
{
return NULL;
}
static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
struct inode *inode, pgoff_t index) struct shared_policy *policy, pgoff_t index)
{ {
} }
...@@ -553,20 +565,30 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ...@@ -553,20 +565,30 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
return 0; return 0;
} }
struct hf_args {
struct file *file;
struct task_struct *parent_task;
struct mm_struct *mm;
struct shared_policy *shared_policy;
struct hstate *hstate;
struct address_space *mapping;
int error;
};
static int hugetlbfs_fallocate_chunk(pgoff_t start, pgoff_t end,
struct hf_args *args);
static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
loff_t len) loff_t len)
{ {
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
struct address_space *mapping = inode->i_mapping;
struct hstate *h = hstate_inode(inode); struct hstate *h = hstate_inode(inode);
struct vm_area_struct pseudo_vma;
struct mm_struct *mm = current->mm;
loff_t hpage_size = huge_page_size(h); loff_t hpage_size = huge_page_size(h);
unsigned long hpage_shift = huge_page_shift(h); unsigned long hpage_shift = huge_page_shift(h);
pgoff_t start, index, end; pgoff_t start, end;
struct hf_args hf_args;
int error; int error;
u32 hash;
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP; return -EOPNOTSUPP;
...@@ -595,16 +617,66 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, ...@@ -595,16 +617,66 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
goto out; goto out;
} }
hf_args.file = file;
hf_args.parent_task = current;
hf_args.mm = current->mm;
hf_args.shared_policy = hugetlb_get_shared_policy(inode);
hf_args.hstate = h;
hf_args.mapping = inode->i_mapping;
hf_args.error = 0;
if (unlikely(hstate_is_gigantic(h))) {
/*
* Use multiple threads in clear_gigantic_page instead of here,
* so just do a 1-threaded hugetlbfs_fallocate_chunk.
*/
error = hugetlbfs_fallocate_chunk(start, end, &hf_args);
} else {
DEFINE_KTASK_CTL(ctl, hugetlbfs_fallocate_chunk,
&hf_args, KTASK_PMD_MINCHUNK);
error = ktask_run((void *)start, end - start, &ctl);
}
if (error != KTASK_RETURN_SUCCESS && hf_args.error != -EINTR)
goto out;
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
i_size_write(inode, offset + len);
inode->i_ctime = current_time(inode);
out:
inode_unlock(inode);
return error;
}
static int hugetlbfs_fallocate_chunk(pgoff_t start, pgoff_t end,
struct hf_args *args)
{
struct file *file = args->file;
struct task_struct *parent_task = args->parent_task;
struct mm_struct *mm = args->mm;
struct shared_policy *shared_policy = args->shared_policy;
struct hstate *h = args->hstate;
struct address_space *mapping = args->mapping;
int error = 0;
pgoff_t index;
struct vm_area_struct pseudo_vma;
loff_t hpage_size;
u32 hash;
hpage_size = huge_page_size(h);
/* /*
* Initialize a pseudo vma as this is required by the huge page * Initialize a pseudo vma as this is required by the huge page
* allocation routines. If NUMA is configured, use page index * allocation routines. If NUMA is configured, use page index
* as input to create an allocation policy. * as input to create an allocation policy. Each thread gets its
* own pseudo vma because mempolicies can differ by page.
*/ */
vma_init(&pseudo_vma, mm); vma_init(&pseudo_vma, mm);
pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
pseudo_vma.vm_file = file; pseudo_vma.vm_file = file;
for (index = start; index < end; index++) { for (index = start; index < end; ++index) {
/* /*
* This is supposed to be the vaddr where the page is being * This is supposed to be the vaddr where the page is being
* faulted in, but we have no vaddr here. * faulted in, but we have no vaddr here.
...@@ -619,13 +691,13 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, ...@@ -619,13 +691,13 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
* fallocate(2) manpage permits EINTR; we may have been * fallocate(2) manpage permits EINTR; we may have been
* interrupted because we are using up too much memory. * interrupted because we are using up too much memory.
*/ */
if (signal_pending(current)) { if (signal_pending(parent_task) || signal_pending(current)) {
error = -EINTR; error = -EINTR;
break; goto err;
} }
/* Set numa allocation policy based on index */ /* Set numa allocation policy based on index */
hugetlb_set_vma_policy(&pseudo_vma, inode, index); hugetlb_set_vma_policy(&pseudo_vma, shared_policy, index);
/* addr is the offset within the file (zero based) */ /* addr is the offset within the file (zero based) */
addr = index * hpage_size; addr = index * hpage_size;
...@@ -649,7 +721,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, ...@@ -649,7 +721,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
if (IS_ERR(page)) { if (IS_ERR(page)) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]); mutex_unlock(&hugetlb_fault_mutex_table[hash]);
error = PTR_ERR(page); error = PTR_ERR(page);
goto out; goto err;
} }
clear_huge_page(page, addr, pages_per_huge_page(h)); clear_huge_page(page, addr, pages_per_huge_page(h));
__SetPageUptodate(page); __SetPageUptodate(page);
...@@ -657,7 +729,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, ...@@ -657,7 +729,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
if (unlikely(error)) { if (unlikely(error)) {
put_page(page); put_page(page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]); mutex_unlock(&hugetlb_fault_mutex_table[hash]);
goto out; goto err;
} }
mutex_unlock(&hugetlb_fault_mutex_table[hash]); mutex_unlock(&hugetlb_fault_mutex_table[hash]);
...@@ -670,11 +742,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, ...@@ -670,11 +742,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
put_page(page); put_page(page);
} }
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) return KTASK_RETURN_SUCCESS;
i_size_write(inode, offset + len);
inode->i_ctime = current_time(inode); err:
out: args->error = error;
inode_unlock(inode);
return error; return error;
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册