提交 53f4e528 编写于 作者: Daniel Jordan 提交者: Xie XiuQi

mm: change locked_vm's type from unsigned long to atomic_long_t

hulk inclusion
category: feature
bugzilla: 13228
CVE: NA
---------------------------

Currently, mmap_sem must be held as writer to modify the locked_vm field
in mm_struct.

This creates a bottleneck when multithreading VFIO page pinning because
each thread holds the mmap_sem as reader for the majority of the pinning
time but also takes mmap_sem as writer regularly, for short times, when
modifying locked_vm.

The problem gets worse when other workloads compete for CPU with ktask
threads doing page pinning because the other workloads force ktask
threads that hold mmap_sem as writer off the CPU, blocking ktask threads
trying to get mmap_sem as reader for an excessively long time (the
mmap_sem reader wait time grows linearly with the thread count).

Requiring mmap_sem for locked_vm also abuses mmap_sem by making it
protect data that could be synchronized separately.

So, decouple locked_vm from mmap_sem by making locked_vm an
atomic_long_t.  locked_vm's old type was unsigned long, and changing it
to a signed type halves its capacity.  That is only a concern on 32-bit
systems, where LONG_MAX * PAGE_SIZE is still 8T on x86 (4K pages), so
there's headroom.

Now that mmap_sem is not taken as writer here, ktask threads holding
mmap_sem as reader can run more often.  Performance results appear later
in the series.

On powerpc, this was cross-compiled-tested only.

[XXX Can send separately.]
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Hongbo Yao <yaohongbo@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Tested-by: Hongbo Yao <yaohongbo@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
上级 b0908eee
......@@ -58,33 +58,34 @@ static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
{
long ret = 0;
long locked_vm, ret = 0;
if (!current || !current->mm)
return ret; /* process exited */
down_write(&current->mm->mmap_sem);
locked_vm = atomic_long_read(&current->mm->locked_vm);
if (inc) {
unsigned long locked, lock_limit;
locked = current->mm->locked_vm + stt_pages;
locked = locked_vm + stt_pages;
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
ret = -ENOMEM;
else
current->mm->locked_vm += stt_pages;
atomic_long_add(stt_pages, &current->mm->locked_vm);
} else {
if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
stt_pages = current->mm->locked_vm;
if (WARN_ON_ONCE(stt_pages > locked_vm))
stt_pages = locked_vm;
current->mm->locked_vm -= stt_pages;
atomic_long_sub(stt_pages, &current->mm->locked_vm);
}
pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
inc ? '+' : '-',
stt_pages << PAGE_SHIFT,
current->mm->locked_vm << PAGE_SHIFT,
atomic_long_read(&current->mm->locked_vm) << PAGE_SHIFT,
rlimit(RLIMIT_MEMLOCK),
ret ? " - exceeded" : "");
......
......@@ -41,31 +41,31 @@ struct mm_iommu_table_group_mem_t {
static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
unsigned long npages, bool incr)
{
long ret = 0, locked, lock_limit;
long ret = 0, locked, lock_limit, locked_vm;
if (!npages)
return 0;
down_write(&mm->mmap_sem);
locked_vm = atomic_long_read(&mm->locked_vm);
if (incr) {
locked = mm->locked_vm + npages;
locked = locked_vm + npages;
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
ret = -ENOMEM;
else
mm->locked_vm += npages;
atomic_long_add(npages, &mm->locked_vm);
} else {
if (WARN_ON_ONCE(npages > mm->locked_vm))
npages = mm->locked_vm;
mm->locked_vm -= npages;
if (WARN_ON_ONCE(npages > locked_vm))
npages = locked_vm;
atomic_long_sub(npages, &mm->locked_vm);
}
pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
current ? current->pid : 0,
incr ? '+' : '-',
npages << PAGE_SHIFT,
mm->locked_vm << PAGE_SHIFT,
atomic_long_read(&mm->locked_vm) << PAGE_SHIFT,
rlimit(RLIMIT_MEMLOCK));
up_write(&mm->mmap_sem);
......
......@@ -45,6 +45,7 @@ void afu_dma_region_init(struct dfl_feature_platform_data *pdata)
static int afu_dma_adjust_locked_vm(struct device *dev, long npages, bool incr)
{
unsigned long locked, lock_limit;
long locked_vm;
int ret = 0;
/* the task is exiting. */
......@@ -53,24 +54,25 @@ static int afu_dma_adjust_locked_vm(struct device *dev, long npages, bool incr)
down_write(&current->mm->mmap_sem);
locked_vm = atomic_long_read(&current->mm->locked_vm);
if (incr) {
locked = current->mm->locked_vm + npages;
locked = locked_vm + npages;
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
ret = -ENOMEM;
else
current->mm->locked_vm += npages;
atomic_long_add(npages, &current->mm->locked_vm);
} else {
if (WARN_ON_ONCE(npages > current->mm->locked_vm))
npages = current->mm->locked_vm;
current->mm->locked_vm -= npages;
if (WARN_ON_ONCE(npages > locked_vm))
npages = locked_vm;
atomic_long_sub(npages, &current->mm->locked_vm);
}
dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %ld/%ld%s\n", current->pid,
incr ? '+' : '-', npages << PAGE_SHIFT,
current->mm->locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK),
ret ? "- execeeded" : "");
atomic_long_read(&current->mm->locked_vm) << PAGE_SHIFT,
rlimit(RLIMIT_MEMLOCK), ret ? "- exceeded" : "");
up_write(&current->mm->mmap_sem);
......
......@@ -60,7 +60,7 @@ static void usnic_uiom_reg_account(struct work_struct *work)
struct usnic_uiom_reg, work);
down_write(&umem->mm->mmap_sem);
umem->mm->locked_vm -= umem->diff;
atomic_long_sub(umem->diff, &umem->mm->locked_vm);
up_write(&umem->mm->mmap_sem);
mmput(umem->mm);
kfree(umem);
......
......@@ -45,16 +45,16 @@ static long try_increment_locked_vm(struct mm_struct *mm, long npages)
return 0;
down_write(&mm->mmap_sem);
locked = mm->locked_vm + npages;
locked = atomic_long_read(&mm->locked_vm) + npages;
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
ret = -ENOMEM;
else
mm->locked_vm += npages;
atomic_long_add(npages, &mm->locked_vm);
pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
npages << PAGE_SHIFT,
mm->locked_vm << PAGE_SHIFT,
atomic_long_read(&mm->locked_vm) << PAGE_SHIFT,
rlimit(RLIMIT_MEMLOCK),
ret ? " - exceeded" : "");
......@@ -69,12 +69,12 @@ static void decrement_locked_vm(struct mm_struct *mm, long npages)
return;
down_write(&mm->mmap_sem);
if (WARN_ON_ONCE(npages > mm->locked_vm))
npages = mm->locked_vm;
mm->locked_vm -= npages;
if (WARN_ON_ONCE(npages > atomic_long_read(&mm->locked_vm)))
npages = atomic_long_read(&mm->locked_vm);
atomic_long_sub(npages, &mm->locked_vm);
pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
npages << PAGE_SHIFT,
mm->locked_vm << PAGE_SHIFT,
atomic_long_read(&mm->locked_vm) << PAGE_SHIFT,
rlimit(RLIMIT_MEMLOCK));
up_write(&mm->mmap_sem);
}
......
......@@ -292,13 +292,14 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
limit = task_rlimit(dma->task,
RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (mm->locked_vm + npage > limit)
if (atomic_long_read(&mm->locked_vm) + npage >
limit)
ret = -ENOMEM;
}
}
if (!ret)
mm->locked_vm += npage;
atomic_long_add(npage, &mm->locked_vm);
up_write(&mm->mmap_sem);
}
......@@ -435,7 +436,8 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
* pages are already counted against the user.
*/
if (!rsvd && !vfio_find_vpfn(dma, iova)) {
if (!dma->lock_cap && mm->locked_vm + 1 > limit) {
if (!dma->lock_cap &&
atomic_long_read(&mm->locked_vm) + 1 > limit) {
put_pfn(*pfn_base, dma->prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
limit << PAGE_SHIFT);
......@@ -461,8 +463,8 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
}
if (!rsvd && !vfio_find_vpfn(dma, iova)) {
if (!dma->lock_cap &&
mm->locked_vm + lock_acct + 1 > limit) {
if (!dma->lock_cap && atomic_long_read(&mm->locked_vm) +
lock_acct + 1 > limit) {
put_pfn(pfn, dma->prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
__func__, limit << PAGE_SHIFT);
......
......@@ -58,7 +58,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
swap = get_mm_counter(mm, MM_SWAPENTS);
SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
SEQ_PUT_DEC(" kB\nVmLck:\t", atomic_long_read(&mm->locked_vm));
SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm);
SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
......
......@@ -405,7 +405,7 @@ struct mm_struct {
unsigned long hiwater_vm; /* High-water virtual memory usage */
unsigned long total_vm; /* Total pages mapped */
unsigned long locked_vm; /* Pages that have PG_mlocked set */
atomic_long_t locked_vm; /* Pages that have PG_mlocked set */
unsigned long pinned_vm; /* Refcount permanently increased */
unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
......
......@@ -992,7 +992,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->core_state = NULL;
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
atomic_long_set(&mm->locked_vm, 0);
mm->pinned_vm = 0;
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
......
......@@ -151,7 +151,8 @@ void dump_mm(const struct mm_struct *mm)
atomic_read(&mm->mm_count),
mm_pgtables_bytes(mm),
mm->map_count,
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm,
atomic_long_read(&mm->locked_vm),
mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
......
......@@ -562,7 +562,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
nr_pages = -nr_pages;
else if (old_flags & VM_LOCKED)
nr_pages = 0;
mm->locked_vm += nr_pages;
atomic_long_add(nr_pages, &mm->locked_vm);
/*
* vm_flags is protected by the mmap_sem held in write mode.
......@@ -687,7 +687,7 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
if (down_write_killable(&current->mm->mmap_sem))
return -EINTR;
locked += current->mm->locked_vm;
locked += atomic_long_read(&current->mm->locked_vm);
if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
/*
* It is possible that the regions requested intersect with
......
......@@ -1316,7 +1316,7 @@ static inline int mlock_future_check(struct mm_struct *mm,
/* mlock MCL_FUTURE? */
if (flags & VM_LOCKED) {
locked = len >> PAGE_SHIFT;
locked += mm->locked_vm;
locked += atomic_long_read(&mm->locked_vm);
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
......@@ -1802,7 +1802,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma == get_gate_vma(current->mm))
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
else
mm->locked_vm += (len >> PAGE_SHIFT);
atomic_long_add(len >> PAGE_SHIFT, &mm->locked_vm);
}
if (file)
......@@ -2268,7 +2268,7 @@ static int acct_stack_growth(struct vm_area_struct *vma,
if (vma->vm_flags & VM_LOCKED) {
unsigned long locked;
unsigned long limit;
locked = mm->locked_vm + grow;
locked = atomic_long_read(&mm->locked_vm) + grow;
limit = rlimit(RLIMIT_MEMLOCK);
limit >>= PAGE_SHIFT;
if (locked > limit && !capable(CAP_IPC_LOCK))
......@@ -2362,7 +2362,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
atomic_long_add(grow, &mm->locked_vm);
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
......@@ -2442,7 +2442,7 @@ int expand_downwards(struct vm_area_struct *vma,
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
atomic_long_add(grow, &mm->locked_vm);
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
......@@ -2767,11 +2767,11 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
/*
* unlock any mlock()ed ranges before detaching vmas
*/
if (mm->locked_vm) {
if (atomic_long_read(&mm->locked_vm)) {
struct vm_area_struct *tmp = vma;
while (tmp && tmp->vm_start < end) {
if (tmp->vm_flags & VM_LOCKED) {
mm->locked_vm -= vma_pages(tmp);
atomic_long_sub(vma_pages(tmp), &mm->locked_vm);
munlock_vma_pages_all(tmp);
}
tmp = tmp->vm_next;
......@@ -3002,7 +3002,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
mm->total_vm += len >> PAGE_SHIFT;
mm->data_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
atomic_long_add(len >> PAGE_SHIFT, &mm->locked_vm);
vma->vm_flags |= VM_SOFTDIRTY;
return 0;
}
......@@ -3074,7 +3074,7 @@ void exit_mmap(struct mm_struct *mm)
up_write(&mm->mmap_sem);
}
if (mm->locked_vm) {
if (atomic_long_read(&mm->locked_vm)) {
vma = mm->mmap;
while (vma) {
if (vma->vm_flags & VM_LOCKED)
......
......@@ -360,7 +360,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
}
if (vm_flags & VM_LOCKED) {
mm->locked_vm += new_len >> PAGE_SHIFT;
atomic_long_add(new_len >> PAGE_SHIFT, &mm->locked_vm);
*locked = true;
}
......@@ -411,7 +411,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (vma->vm_flags & VM_LOCKED) {
unsigned long locked, lock_limit;
locked = mm->locked_vm << PAGE_SHIFT;
locked = atomic_long_read(&mm->locked_vm) << PAGE_SHIFT;
lock_limit = rlimit(RLIMIT_MEMLOCK);
locked += new_len - old_len;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
......@@ -591,7 +591,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
vm_stat_account(mm, vma->vm_flags, pages);
if (vma->vm_flags & VM_LOCKED) {
mm->locked_vm += pages;
atomic_long_add(pages, &mm->locked_vm);
locked = true;
new_addr = addr;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册