提交 b37a05c0 编写于 作者: L Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge fixes from Andrew Morton:
 "18 fixes"

[ The 18 fixes turned into 17 commits, because one of the fixes was a
  fix for another patch in the series that I just folded in by editing
  the patch manually - hopefully correctly     - Linus ]

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm: fix memory leak in copy_huge_pmd()
  drivers/hwspinlock: fix race between radix tree insertion and lookup
  radix-tree: fix race in gang lookup
  mm/vmpressure.c: fix subtree pressure detection
  mm: polish virtual memory accounting
  mm: warn about VmData over RLIMIT_DATA
  Documentation: cgroup-v2: add memory.stat::sock description
  mm: memcontrol: drop superfluous entry in the per-memcg stats array
  drivers/scsi/sg.c: mark VMA as VM_IO to prevent migration
  proc: revert /proc/<pid>/maps [stack:TID] annotation
  numa: fix /proc/<pid>/numa_maps for hugetlbfs on s390
  MAINTAINERS: update Seth email
  ocfs2/cluster: fix memory leak in o2hb_region_release
  lib/test-string_helpers.c: fix and improve string_get_size() tests
  thp: limit number of object to scan on deferred_split_scan()
  thp: change deferred_split_count() to return number of THP in queue
  thp: make split_queue per-node
......@@ -843,6 +843,10 @@ PAGE_SIZE multiple when read back.
Amount of memory used to cache filesystem data,
including tmpfs and shared memory.
sock
Amount of memory used in network transmission buffers
file_mapped
Amount of cached filesystem data mapped with mmap()
......
......@@ -240,8 +240,8 @@ Table 1-2: Contents of the status files (as of 4.1)
RssFile size of resident file mappings
RssShmem size of resident shmem memory (includes SysV shm,
mapping of tmpfs and shared anonymous mappings)
VmData size of data, stack, and text segments
VmStk size of data, stack, and text segments
VmData size of private data segments
VmStk size of stack segments
VmExe size of text segment
VmLib size of shared library code
VmPTE size of page table entries
......@@ -356,7 +356,7 @@ address perms offset dev inode pathname
a7cb1000-a7cb2000 ---p 00000000 00:00 0
a7cb2000-a7eb2000 rw-p 00000000 00:00 0
a7eb2000-a7eb3000 ---p 00000000 00:00 0
a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack:1001]
a7eb3000-a7ed5000 rw-p 00000000 00:00 0
a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6
a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6
a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6
......@@ -388,7 +388,6 @@ is not associated with a file:
[heap] = the heap of the program
[stack] = the stack of the main process
[stack:1001] = the stack of the thread with tid 1001
[vdso] = the "virtual dynamic shared object",
the kernel system call handler
......@@ -396,10 +395,8 @@ is not associated with a file:
The /proc/PID/task/TID/maps is a view of the virtual memory from the viewpoint
of the individual tasks of a process. In this file you will see a mapping marked
as [stack] if that task sees it as a stack. This is a key difference from the
content of /proc/PID/maps, where you will see all mappings that are being used
as stack by all of those tasks. Hence, for the example above, the task-level
map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this:
as [stack] if that task sees it as a stack. Hence, for the example above, the
task-level map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this:
08048000-08049000 r-xp 00000000 03:00 8312 /opt/test
08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test
......
......@@ -1496,6 +1496,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
could change it dynamically, usually by
/sys/module/printk/parameters/ignore_loglevel.
ignore_rlimit_data
Ignore RLIMIT_DATA setting for data mappings,
print warning at first misuse. Can be changed via
/sys/module/kernel/parameters/ignore_rlimit_data.
ihash_entries= [KNL]
Set number of hash buckets for inode cache.
......
......@@ -12150,7 +12150,7 @@ F: drivers/net/hamradio/*scc.c
F: drivers/net/hamradio/z8530.h
ZBUD COMPRESSED PAGE ALLOCATOR
M: Seth Jennings <sjennings@variantweb.net>
M: Seth Jennings <sjenning@redhat.com>
L: linux-mm@kvack.org
S: Maintained
F: mm/zbud.c
......@@ -12205,7 +12205,7 @@ F: include/linux/zsmalloc.h
F: Documentation/vm/zsmalloc.txt
ZSWAP COMPRESSED SWAP CACHING
M: Seth Jennings <sjennings@variantweb.net>
M: Seth Jennings <sjenning@redhat.com>
L: linux-mm@kvack.org
S: Maintained
F: mm/zswap.c
......
......@@ -313,6 +313,10 @@ int of_hwspin_lock_get_id(struct device_node *np, int index)
hwlock = radix_tree_deref_slot(slot);
if (unlikely(!hwlock))
continue;
if (radix_tree_is_indirect_ptr(hwlock)) {
slot = radix_tree_iter_retry(&iter);
continue;
}
if (hwlock->bank->dev->of_node == args.np) {
ret = 0;
......
......@@ -1261,7 +1261,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma)
}
sfp->mmap_called = 1;
vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_private_data = sfp;
vma->vm_ops = &sg_mmap_vm_ops;
return 0;
......
......@@ -1254,15 +1254,15 @@ static const struct file_operations o2hb_debug_fops = {
void o2hb_exit(void)
{
kfree(o2hb_db_livenodes);
kfree(o2hb_db_liveregions);
kfree(o2hb_db_quorumregions);
kfree(o2hb_db_failedregions);
debugfs_remove(o2hb_debug_failedregions);
debugfs_remove(o2hb_debug_quorumregions);
debugfs_remove(o2hb_debug_liveregions);
debugfs_remove(o2hb_debug_livenodes);
debugfs_remove(o2hb_debug_dir);
kfree(o2hb_db_livenodes);
kfree(o2hb_db_liveregions);
kfree(o2hb_db_quorumregions);
kfree(o2hb_db_failedregions);
}
static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
......@@ -1438,13 +1438,15 @@ static void o2hb_region_release(struct config_item *item)
kfree(reg->hr_slots);
kfree(reg->hr_db_regnum);
kfree(reg->hr_db_livenodes);
debugfs_remove(reg->hr_debug_livenodes);
debugfs_remove(reg->hr_debug_regnum);
debugfs_remove(reg->hr_debug_elapsed_time);
debugfs_remove(reg->hr_debug_pinned);
debugfs_remove(reg->hr_debug_dir);
kfree(reg->hr_db_livenodes);
kfree(reg->hr_db_regnum);
kfree(reg->hr_debug_elapsed_time);
kfree(reg->hr_debug_pinned);
spin_lock(&o2hb_live_lock);
list_del(&reg->hr_all_item);
......
......@@ -259,23 +259,29 @@ static int do_maps_open(struct inode *inode, struct file *file,
sizeof(struct proc_maps_private));
}
static pid_t pid_of_stack(struct proc_maps_private *priv,
struct vm_area_struct *vma, bool is_pid)
/*
* Indicate if the VMA is a stack for the given task; for
* /proc/PID/maps that is the stack of the main task.
*/
static int is_stack(struct proc_maps_private *priv,
struct vm_area_struct *vma, int is_pid)
{
int stack = 0;
if (is_pid) {
stack = vma->vm_start <= vma->vm_mm->start_stack &&
vma->vm_end >= vma->vm_mm->start_stack;
} else {
struct inode *inode = priv->inode;
struct task_struct *task;
pid_t ret = 0;
rcu_read_lock();
task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task) {
task = task_of_stack(task, vma, is_pid);
if (task)
ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
}
stack = vma_is_stack_for_task(vma, task);
rcu_read_unlock();
return ret;
}
return stack;
}
static void
......@@ -335,8 +341,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
name = arch_vma_name(vma);
if (!name) {
pid_t tid;
if (!mm) {
name = "[vdso]";
goto done;
......@@ -348,21 +352,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
goto done;
}
tid = pid_of_stack(priv, vma, is_pid);
if (tid != 0) {
/*
* Thread stack in /proc/PID/task/TID/maps or
* the main process stack.
*/
if (!is_pid || (vma->vm_start <= mm->start_stack &&
vma->vm_end >= mm->start_stack)) {
if (is_stack(priv, vma, is_pid))
name = "[stack]";
} else {
/* Thread stack in /proc/PID/maps */
seq_pad(m, ' ');
seq_printf(m, "[stack:%d]", tid);
}
}
}
done:
......@@ -1552,18 +1543,19 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
pte_t huge_pte = huge_ptep_get(pte);
struct numa_maps *md;
struct page *page;
if (!pte_present(*pte))
if (!pte_present(huge_pte))
return 0;
page = pte_page(*pte);
page = pte_page(huge_pte);
if (!page)
return 0;
md = walk->private;
gather_stats(page, md, pte_dirty(*pte), 1);
gather_stats(page, md, pte_dirty(huge_pte), 1);
return 0;
}
......@@ -1617,19 +1609,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
seq_file_path(m, file, "\n\t= ");
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_puts(m, " heap");
} else {
pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
if (tid != 0) {
/*
* Thread stack in /proc/PID/task/TID/maps or
* the main process stack.
*/
if (!is_pid || (vma->vm_start <= mm->start_stack &&
vma->vm_end >= mm->start_stack))
} else if (is_stack(proc_priv, vma, is_pid)) {
seq_puts(m, " stack");
else
seq_printf(m, " stack:%d", tid);
}
}
if (is_vm_hugetlb_page(vma))
......
......@@ -123,23 +123,26 @@ unsigned long task_statm(struct mm_struct *mm,
return size;
}
static pid_t pid_of_stack(struct proc_maps_private *priv,
struct vm_area_struct *vma, bool is_pid)
static int is_stack(struct proc_maps_private *priv,
struct vm_area_struct *vma, int is_pid)
{
struct mm_struct *mm = vma->vm_mm;
int stack = 0;
if (is_pid) {
stack = vma->vm_start <= mm->start_stack &&
vma->vm_end >= mm->start_stack;
} else {
struct inode *inode = priv->inode;
struct task_struct *task;
pid_t ret = 0;
rcu_read_lock();
task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task) {
task = task_of_stack(task, vma, is_pid);
if (task)
ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
}
stack = vma_is_stack_for_task(vma, task);
rcu_read_unlock();
return ret;
}
return stack;
}
/*
......@@ -181,21 +184,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
if (file) {
seq_pad(m, ' ');
seq_file_path(m, file, "");
} else if (mm) {
pid_t tid = pid_of_stack(priv, vma, is_pid);
if (tid != 0) {
} else if (mm && is_stack(priv, vma, is_pid)) {
seq_pad(m, ' ');
/*
* Thread stack in /proc/PID/task/TID/maps or
* the main process stack.
*/
if (!is_pid || (vma->vm_start <= mm->start_stack &&
vma->vm_end >= mm->start_stack))
seq_printf(m, "[stack]");
else
seq_printf(m, "[stack:%d]", tid);
}
}
seq_putc(m, '\n');
......
......@@ -51,7 +51,7 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
MEM_CGROUP_STAT_NSTATS,
/* default hierarchy stats */
MEMCG_SOCK,
MEMCG_SOCK = MEM_CGROUP_STAT_NSTATS,
MEMCG_NR_STAT,
};
......
......@@ -201,11 +201,13 @@ extern unsigned int kobjsize(const void *objp);
#endif
#ifdef CONFIG_STACK_GROWSUP
#define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
#define VM_STACK VM_GROWSUP
#else
#define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
#define VM_STACK VM_GROWSDOWN
#endif
#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
/*
* Special vmas that are non-mergable, non-mlock()able.
* Note: mm/huge_memory.c VM_NO_THP depends on this definition.
......@@ -1341,8 +1343,7 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma,
!vma_growsup(vma->vm_next, addr);
}
extern struct task_struct *task_of_stack(struct task_struct *task,
struct vm_area_struct *vma, bool in_group);
int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t);
extern unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
......
......@@ -424,9 +424,9 @@ struct mm_struct {
unsigned long total_vm; /* Total pages mapped */
unsigned long locked_vm; /* Pages that have PG_mlocked set */
unsigned long pinned_vm; /* Refcount permanently increased */
unsigned long data_vm; /* VM_WRITE & ~VM_SHARED/GROWSDOWN */
unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */
unsigned long stack_vm; /* VM_GROWSUP/DOWN */
unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
unsigned long stack_vm; /* VM_STACK */
unsigned long def_flags;
unsigned long start_code, end_code, start_data, end_data;
unsigned long start_brk, brk, start_stack;
......
......@@ -682,6 +682,12 @@ typedef struct pglist_data {
*/
unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
spinlock_t split_queue_lock;
struct list_head split_queue;
unsigned long split_queue_len;
#endif
} pg_data_t;
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
......
......@@ -378,6 +378,22 @@ radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
void **radix_tree_next_chunk(struct radix_tree_root *root,
struct radix_tree_iter *iter, unsigned flags);
/**
 * radix_tree_iter_retry - retry this chunk of the iteration
 * @iter:	iterator state
 *
 * A walk protected only by the RCU lock can race with a concurrent
 * insertion or deletion and come across a slot for which
 * radix_tree_deref_retry() returns true.  When that happens, call this
 * helper, store its result in the slot cursor and continue the loop.
 *
 * Sets @iter->next_index back to @iter->index and always returns NULL.
 */
static inline __must_check
void **radix_tree_iter_retry(struct radix_tree_iter *iter)
{
	void **restart_slot = NULL;

	iter->next_index = iter->index;

	return restart_slot;
}
/**
* radix_tree_chunk_size - get current chunk size
*
......
......@@ -1019,9 +1019,13 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
return 0;
radix_tree_for_each_slot(slot, root, &iter, first_index) {
results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot));
results[ret] = rcu_dereference_raw(*slot);
if (!results[ret])
continue;
if (radix_tree_is_indirect_ptr(results[ret])) {
slot = radix_tree_iter_retry(&iter);
continue;
}
if (++ret == max_items)
break;
}
......@@ -1098,9 +1102,13 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
return 0;
radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot));
results[ret] = rcu_dereference_raw(*slot);
if (!results[ret])
continue;
if (radix_tree_is_indirect_ptr(results[ret])) {
slot = radix_tree_iter_retry(&iter);
continue;
}
if (++ret == max_items)
break;
}
......
......@@ -327,36 +327,67 @@ static __init void test_string_escape(const char *name,
}
#define string_get_size_maxbuf 16
#define test_string_get_size_one(size, blk_size, units, exp_result) \
#define test_string_get_size_one(size, blk_size, exp_result10, exp_result2) \
do { \
BUILD_BUG_ON(sizeof(exp_result) >= string_get_size_maxbuf); \
__test_string_get_size((size), (blk_size), (units), \
(exp_result)); \
BUILD_BUG_ON(sizeof(exp_result10) >= string_get_size_maxbuf); \
BUILD_BUG_ON(sizeof(exp_result2) >= string_get_size_maxbuf); \
__test_string_get_size((size), (blk_size), (exp_result10), \
(exp_result2)); \
} while (0)
static __init void __test_string_get_size(const u64 size, const u64 blk_size,
const enum string_size_units units,
const char *exp_result)
static __init void test_string_get_size_check(const char *units,
const char *exp,
char *res,
const u64 size,
const u64 blk_size)
{
char buf[string_get_size_maxbuf];
string_get_size(size, blk_size, units, buf, sizeof(buf));
if (!memcmp(buf, exp_result, strlen(exp_result) + 1))
if (!memcmp(res, exp, strlen(exp) + 1))
return;
buf[sizeof(buf) - 1] = '\0';
pr_warn("Test 'test_string_get_size_one' failed!\n");
pr_warn("string_get_size(size = %llu, blk_size = %llu, units = %d\n",
res[string_get_size_maxbuf - 1] = '\0';
pr_warn("Test 'test_string_get_size' failed!\n");
pr_warn("string_get_size(size = %llu, blk_size = %llu, units = %s)\n",
size, blk_size, units);
pr_warn("expected: '%s', got '%s'\n", exp_result, buf);
pr_warn("expected: '%s', got '%s'\n", exp, res);
}
/*
 * Format @size blocks of @blk_size bytes once with STRING_UNITS_10 and once
 * with STRING_UNITS_2, then compare each result with its expected string.
 * Mismatches are reported by test_string_get_size_check().
 */
static __init void __test_string_get_size(const u64 size, const u64 blk_size,
					   const char *exp_result10,
					   const char *exp_result2)
{
	char out_units10[string_get_size_maxbuf];
	char out_units2[string_get_size_maxbuf];

	/* STRING_UNITS_10 variant */
	string_get_size(size, blk_size, STRING_UNITS_10,
			out_units10, sizeof(out_units10));
	test_string_get_size_check("STRING_UNITS_10", exp_result10,
				   out_units10, size, blk_size);

	/* STRING_UNITS_2 variant */
	string_get_size(size, blk_size, STRING_UNITS_2,
			out_units2, sizeof(out_units2));
	test_string_get_size_check("STRING_UNITS_2", exp_result2,
				   out_units2, size, blk_size);
}
static __init void test_string_get_size(void)
{
test_string_get_size_one(16384, 512, STRING_UNITS_2, "8.00 MiB");
test_string_get_size_one(8192, 4096, STRING_UNITS_10, "32.7 MB");
test_string_get_size_one(1, 512, STRING_UNITS_10, "512 B");
/* small values */
test_string_get_size_one(0, 512, "0 B", "0 B");
test_string_get_size_one(1, 512, "512 B", "512 B");
test_string_get_size_one(1100, 1, "1.10 kB", "1.07 KiB");
/* normal values */
test_string_get_size_one(16384, 512, "8.39 MB", "8.00 MiB");
test_string_get_size_one(500118192, 512, "256 GB", "238 GiB");
test_string_get_size_one(8192, 4096, "33.6 MB", "32.0 MiB");
/* weird block sizes */
test_string_get_size_one(3000, 1900, "5.70 MB", "5.44 MiB");
/* huge values */
test_string_get_size_one(U64_MAX, 4096, "75.6 ZB", "64.0 ZiB");
test_string_get_size_one(4096, U64_MAX, "75.6 ZB", "64.0 ZiB");
}
static int __init test_string_helpers_init(void)
......
......@@ -138,9 +138,6 @@ static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
static DEFINE_SPINLOCK(split_queue_lock);
static LIST_HEAD(split_queue);
static unsigned long split_queue_len;
static struct shrinker deferred_split_shrinker;
static void set_recommended_min_free_kbytes(void)
......@@ -861,6 +858,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return false;
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
if (pgtable)
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
atomic_long_inc(&mm->nr_ptes);
......@@ -1039,13 +1037,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
spinlock_t *dst_ptl, *src_ptl;
struct page *src_page;
pmd_t pmd;
pgtable_t pgtable;
pgtable_t pgtable = NULL;
int ret;
if (!vma_is_dax(vma)) {
ret = -ENOMEM;
pgtable = pte_alloc_one(dst_mm, addr);
if (unlikely(!pgtable))
goto out;
}
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
......@@ -1076,7 +1076,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
if (pmd_trans_huge(pmd)) {
if (!vma_is_dax(vma)) {
/* thp accounting separate from pmd_devmap accounting */
src_page = pmd_page(pmd);
VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
......@@ -3358,6 +3358,7 @@ int total_mapcount(struct page *page)
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct page *head = compound_head(page);
struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
struct anon_vma *anon_vma;
int count, mapcount, ret;
bool mlocked;
......@@ -3401,19 +3402,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
lru_add_drain();
/* Prevent deferred_split_scan() touching ->_count */
spin_lock_irqsave(&split_queue_lock, flags);
spin_lock_irqsave(&pgdata->split_queue_lock, flags);
count = page_count(head);
mapcount = total_mapcount(head);
if (!mapcount && count == 1) {
if (!list_empty(page_deferred_list(head))) {
split_queue_len--;
pgdata->split_queue_len--;
list_del(page_deferred_list(head));
}
spin_unlock_irqrestore(&split_queue_lock, flags);
spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
__split_huge_page(page, list);
ret = 0;
} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
spin_unlock_irqrestore(&split_queue_lock, flags);
spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
pr_alert("total_mapcount: %u, page_count(): %u\n",
mapcount, count);
if (PageTail(page))
......@@ -3421,7 +3422,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
dump_page(page, "total_mapcount(head) > 0");
BUG();
} else {
spin_unlock_irqrestore(&split_queue_lock, flags);
spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
unfreeze_page(anon_vma, head);
ret = -EBUSY;
}
......@@ -3436,64 +3437,65 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
void free_transhuge_page(struct page *page)
{
struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
unsigned long flags;
spin_lock_irqsave(&split_queue_lock, flags);
spin_lock_irqsave(&pgdata->split_queue_lock, flags);
if (!list_empty(page_deferred_list(page))) {
split_queue_len--;
pgdata->split_queue_len--;
list_del(page_deferred_list(page));
}
spin_unlock_irqrestore(&split_queue_lock, flags);
spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
free_compound_page(page);
}
void deferred_split_huge_page(struct page *page)
{
struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
unsigned long flags;
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
spin_lock_irqsave(&split_queue_lock, flags);
spin_lock_irqsave(&pgdata->split_queue_lock, flags);
if (list_empty(page_deferred_list(page))) {
list_add_tail(page_deferred_list(page), &split_queue);
split_queue_len++;
list_add_tail(page_deferred_list(page), &pgdata->split_queue);
pgdata->split_queue_len++;
}
spin_unlock_irqrestore(&split_queue_lock, flags);
spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
}
static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc)
{
/*
* Split a page from split_queue will free up at least one page,
* at most HPAGE_PMD_NR - 1. We don't track exact number.
* Let's use HPAGE_PMD_NR / 2 as ballpark.
*/
return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
struct pglist_data *pgdata = NODE_DATA(sc->nid);
return ACCESS_ONCE(pgdata->split_queue_len);
}
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct pglist_data *pgdata = NODE_DATA(sc->nid);
unsigned long flags;
LIST_HEAD(list), *pos, *next;
struct page *page;
int split = 0;
spin_lock_irqsave(&split_queue_lock, flags);
list_splice_init(&split_queue, &list);
spin_lock_irqsave(&pgdata->split_queue_lock, flags);
/* Take pin on all head pages to avoid freeing them under us */
list_for_each_safe(pos, next, &list) {
page = list_entry((void *)pos, struct page, mapping);
page = compound_head(page);
/* race with put_compound_page() */
if (!get_page_unless_zero(page)) {
if (get_page_unless_zero(page)) {
list_move(page_deferred_list(page), &list);
} else {
/* We lost race with put_compound_page() */
list_del_init(page_deferred_list(page));
split_queue_len--;
pgdata->split_queue_len--;
}
if (!--sc->nr_to_scan)
break;
}
spin_unlock_irqrestore(&split_queue_lock, flags);
spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
list_for_each_safe(pos, next, &list) {
page = list_entry((void *)pos, struct page, mapping);
......@@ -3505,17 +3507,24 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
put_page(page);
}
spin_lock_irqsave(&split_queue_lock, flags);
list_splice_tail(&list, &split_queue);
spin_unlock_irqrestore(&split_queue_lock, flags);
spin_lock_irqsave(&pgdata->split_queue_lock, flags);
list_splice_tail(&list, &pgdata->split_queue);
spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
return split * HPAGE_PMD_NR / 2;
/*
* Stop shrinker if we didn't split any page, but the queue is empty.
* This can happen if pages were freed under us.
*/
if (!split && list_empty(&pgdata->split_queue))
return SHRINK_STOP;
return split;
}
/*
 * Shrinker for the deferred split queues.  Both callbacks select the
 * queue through NODE_DATA(sc->nid), hence SHRINKER_NUMA_AWARE.
 */
static struct shrinker deferred_split_shrinker = {
	.flags		= SHRINKER_NUMA_AWARE,
	.seeks		= DEFAULT_SEEKS,
	.count_objects	= deferred_split_count,
	.scan_objects	= deferred_split_scan,
};
#ifdef CONFIG_DEBUG_FS
......
......@@ -216,6 +216,37 @@ static inline bool is_cow_mapping(vm_flags_t flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
/*
 * These three helpers classify VMAs for virtual memory accounting.
 */
/*
 * Executable code area: VM_EXEC is set while VM_WRITE and VM_STACK are
 * both clear.
 */
static inline bool is_exec_mapping(vm_flags_t flags)
{
	const vm_flags_t relevant = VM_EXEC | VM_WRITE | VM_STACK;

	return (flags & relevant) == VM_EXEC;
}
/*
 * Stack area: a mapping that automatically grows in one direction,
 * i.e. one that has VM_STACK set.
 *
 * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
 * do_mmap() forbids all other combinations.
 */
static inline bool is_stack_mapping(vm_flags_t flags)
{
	const vm_flags_t stack_bits = flags & VM_STACK;

	return stack_bits == VM_STACK;
}
/*
 * Data area: VM_WRITE is set while VM_SHARED and VM_STACK are both
 * clear, i.e. a private, writable mapping that is not a stack.
 */
static inline bool is_data_mapping(vm_flags_t flags)
{
	const vm_flags_t relevant = VM_WRITE | VM_SHARED | VM_STACK;

	return (flags & relevant) == VM_WRITE;
}
/* mm/util.c */
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent);
......
......@@ -42,6 +42,7 @@
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
......@@ -69,6 +70,8 @@ const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
#endif
static bool ignore_rlimit_data = true;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
......@@ -2982,9 +2985,17 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
return false;
if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS &
(VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE)
return mm->data_vm + npages <= rlimit(RLIMIT_DATA);
if (is_data_mapping(flags) &&
mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
if (ignore_rlimit_data)
pr_warn_once("%s (%d): VmData %lu exceed data ulimit "
"%lu. Will be forbidden soon.\n",
current->comm, current->pid,
(mm->data_vm + npages) << PAGE_SHIFT,
rlimit(RLIMIT_DATA));
else
return false;
}
return true;
}
......@@ -2993,11 +3004,11 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
{
mm->total_vm += npages;
if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC)
if (is_exec_mapping(flags))
mm->exec_vm += npages;
else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN)))
else if (is_stack_mapping(flags))
mm->stack_vm += npages;
else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
else if (is_data_mapping(flags))
mm->data_vm += npages;
}
......
......@@ -5209,6 +5209,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
spin_lock_init(&pgdat->numabalancing_migrate_lock);
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies;
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
spin_lock_init(&pgdat->split_queue_lock);
INIT_LIST_HEAD(&pgdat->split_queue);
pgdat->split_queue_len = 0;
#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
......
......@@ -230,36 +230,11 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
}
/* Check if the vma is being used as a stack by this task */
static int vm_is_stack_for_task(struct task_struct *t,
struct vm_area_struct *vma)
int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t)
{
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}
/*
* Check if the vma is being used as a stack.
* If is_group is non-zero, check in the entire thread group or else
* just check in the current task. Returns the task_struct of the task
* that the vma is stack for. Must be called under rcu_read_lock().
*/
struct task_struct *task_of_stack(struct task_struct *task,
struct vm_area_struct *vma, bool in_group)
{
if (vm_is_stack_for_task(task, vma))
return task;
if (in_group) {
struct task_struct *t;
for_each_thread(task, t) {
if (vm_is_stack_for_task(t, vma))
return t;
}
}
return NULL;
}
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm)
{
......
......@@ -248,9 +248,8 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
if (tree) {
spin_lock(&vmpr->sr_lock);
vmpr->tree_scanned += scanned;
scanned = vmpr->tree_scanned += scanned;
vmpr->tree_reclaimed += reclaimed;
scanned = vmpr->scanned;
spin_unlock(&vmpr->sr_lock);
if (scanned < vmpressure_win)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册