提交 95211279 编写于 作者: L Linus Torvalds

Merge branch 'akpm' (Andrew's patch-bomb)

Merge first batch of patches from Andrew Morton:
 "A few misc things and all the MM queue"

* emailed from Andrew Morton <akpm@linux-foundation.org>: (92 commits)
  memcg: avoid THP split in task migration
  thp: add HPAGE_PMD_* definitions for !CONFIG_TRANSPARENT_HUGEPAGE
  memcg: clean up existing move charge code
  mm/memcontrol.c: remove unnecessary 'break' in mem_cgroup_read()
  mm/memcontrol.c: remove redundant BUG_ON() in mem_cgroup_usage_unregister_event()
  mm/memcontrol.c: s/stealed/stolen/
  memcg: fix performance of mem_cgroup_begin_update_page_stat()
  memcg: remove PCG_FILE_MAPPED
  memcg: use new logic for page stat accounting
  memcg: remove PCG_MOVE_LOCK flag from page_cgroup
  memcg: simplify move_account() check
  memcg: remove EXPORT_SYMBOL(mem_cgroup_update_page_stat)
  memcg: kill dead prev_priority stubs
  memcg: remove PCG_CACHE page_cgroup flag
  memcg: let css_get_next() rely upon rcu_read_lock()
  cgroup: revert ss_id_lock to spinlock
  idr: make idr_get_next() good for rcu_read_lock()
  memcg: remove unnecessary thp check in page stat accounting
  memcg: remove redundant returns
  memcg: enum lru_list lru
  ...
无相关合并请求
......@@ -290,7 +290,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
rsslim current limit in bytes on the rss
start_code address above which program text can run
end_code address below which program text can run
start_stack address of the start of the stack
start_stack address of the start of the main process stack
esp current value of ESP
eip current value of EIP
pending bitmap of pending signals
......@@ -325,7 +325,7 @@ address perms offset dev inode pathname
a7cb1000-a7cb2000 ---p 00000000 00:00 0
a7cb2000-a7eb2000 rw-p 00000000 00:00 0
a7eb2000-a7eb3000 ---p 00000000 00:00 0
a7eb3000-a7ed5000 rw-p 00000000 00:00 0
a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack:1001]
a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6
a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6
a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6
......@@ -357,11 +357,39 @@ is not associated with a file:
[heap] = the heap of the program
[stack] = the stack of the main process
[stack:1001] = the stack of the thread with tid 1001
[vdso] = the "virtual dynamic shared object",
the kernel system call handler
or if empty, the mapping is anonymous.
The /proc/PID/task/TID/maps file is a view of the virtual memory from the
viewpoint of an individual task of a process. In this file a mapping is marked
as [stack] if that task sees it as its stack. This is a key difference from
/proc/PID/maps, which shows all mappings that are being used as a stack by any
of the process's tasks. Hence, for the example above, the task-level map, i.e.
/proc/PID/task/TID/maps for thread 1001, will look like this:
08048000-08049000 r-xp 00000000 03:00 8312 /opt/test
08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test
0804a000-0806b000 rw-p 00000000 00:00 0 [heap]
a7cb1000-a7cb2000 ---p 00000000 00:00 0
a7cb2000-a7eb2000 rw-p 00000000 00:00 0
a7eb2000-a7eb3000 ---p 00000000 00:00 0
a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack]
a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6
a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6
a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6
a800b000-a800e000 rw-p 00000000 00:00 0
a800e000-a8022000 r-xp 00000000 03:00 14462 /lib/libpthread.so.0
a8022000-a8023000 r--p 00013000 03:00 14462 /lib/libpthread.so.0
a8023000-a8024000 rw-p 00014000 03:00 14462 /lib/libpthread.so.0
a8024000-a8027000 rw-p 00000000 00:00 0
a8027000-a8043000 r-xp 00000000 03:00 8317 /lib/ld-linux.so.2
a8043000-a8044000 r--p 0001b000 03:00 8317 /lib/ld-linux.so.2
a8044000-a8045000 rw-p 0001c000 03:00 8317 /lib/ld-linux.so.2
aff35000-aff4a000 rw-p 00000000 00:00 0
ffffe000-fffff000 r-xp 00000000 00:00 0 [vdso]
The /proc/PID/smaps is an extension based on maps, showing the memory
consumption for each of the process's mappings. For each of the mappings there
......
......@@ -2635,6 +2635,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
to facilitate early boot debugging.
See also Documentation/trace/events.txt
transparent_hugepage=
[KNL]
Format: [always|madvise|never]
Can be used to control the default behavior of the system
with respect to transparent hugepages.
See Documentation/vm/transhuge.txt for more details.
tsc= Disable clocksource stability checks for TSC.
Format: <string>
[x86] reliable: mark tsc clocksource as reliable, this
......
......@@ -98,6 +98,7 @@
#define KPF_HWPOISON 19
#define KPF_NOPAGE 20
#define KPF_KSM 21
#define KPF_THP 22
/* [32-] kernel hacking assistances */
#define KPF_RESERVED 32
......@@ -147,6 +148,7 @@ static const char *page_flag_names[] = {
[KPF_HWPOISON] = "X:hwpoison",
[KPF_NOPAGE] = "n:nopage",
[KPF_KSM] = "x:ksm",
[KPF_THP] = "t:thp",
[KPF_RESERVED] = "r:reserved",
[KPF_MLOCKED] = "m:mlocked",
......
......@@ -60,6 +60,7 @@ There are three components to pagemap:
19. HWPOISON
20. NOPAGE
21. KSM
22. THP
Short descriptions to the page flags:
......@@ -97,6 +98,9 @@ Short descriptions to the page flags:
21. KSM
identical memory pages dynamically shared between one or more processes
22. THP
contiguous pages which construct transparent hugepages
[IO related page flags]
1. ERROR IO error occurred
3. UPTODATE page has up-to-date data
......
......@@ -776,7 +776,6 @@ static inline int handle_signal32(unsigned long signr, struct k_sigaction *ka,
siginfo_t *info,
sigset_t *oldset, struct pt_regs *regs)
{
sigset_t blocked;
int err;
if (ka->sa.sa_flags & SA_SIGINFO)
......@@ -787,11 +786,7 @@ static inline int handle_signal32(unsigned long signr, struct k_sigaction *ka,
if (err)
return err;
sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NOMASK))
sigaddset(&blocked, signr);
set_current_blocked(&blocked);
block_sigmask(ka, signr);
tracehook_signal_handler(signr, info, ka, regs, 0);
return 0;
......
......@@ -465,7 +465,6 @@ static inline int
handle_signal(unsigned long signr, struct k_sigaction *ka,
siginfo_t *info, sigset_t *oldset, struct pt_regs *regs)
{
sigset_t blocked;
int err;
if (ka->sa.sa_flags & SA_SIGINFO)
......@@ -476,11 +475,7 @@ handle_signal(unsigned long signr, struct k_sigaction *ka,
if (err)
return err;
sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NOMASK))
sigaddset(&blocked, signr);
set_current_blocked(&blocked);
block_sigmask(ka, signr);
tracehook_signal_handler(signr, info, ka, regs, 0);
return 0;
......
......@@ -479,18 +479,14 @@ static inline int handle_signal(unsigned long signr, struct k_sigaction *ka,
siginfo_t *info,
sigset_t *oldset, struct pt_regs *regs)
{
sigset_t blocked;
int err;
err = setup_rt_frame(ka, regs, signr, oldset,
(ka->sa.sa_flags & SA_SIGINFO) ? info : NULL);
if (err)
return err;
sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NOMASK))
sigaddset(&blocked, signr);
set_current_blocked(&blocked);
block_sigmask(ka, signr);
tracehook_signal_handler(signr, info, ka, regs, 0);
return 0;
......
......@@ -195,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
unsigned long addr = addr0, start_addr;
/* requested length too big for entire address space */
if (len > TASK_SIZE)
......@@ -223,25 +223,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
mm->free_area_cache = mm->mmap_base;
}
try_again:
/* either no address requested or can't fit in requested address hole */
addr = mm->free_area_cache;
/* make sure it can fit in the remaining address space */
if (addr > len) {
unsigned long tmp_addr = align_addr(addr - len, filp,
ALIGN_TOPDOWN);
vma = find_vma(mm, tmp_addr);
if (!vma || tmp_addr + len <= vma->vm_start)
/* remember the address as a hint for next time */
return mm->free_area_cache = tmp_addr;
}
if (mm->mmap_base < len)
goto bottomup;
start_addr = addr = mm->free_area_cache;
addr = mm->mmap_base-len;
if (addr < len)
goto fail;
addr -= len;
do {
addr = align_addr(addr, filp, ALIGN_TOPDOWN);
......@@ -263,6 +252,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
addr = vma->vm_start-len;
} while (len < vma->vm_start);
fail:
/*
* if hint left us with no space for the requested
* mapping then try again:
*/
if (start_addr != mm->mmap_base) {
mm->free_area_cache = mm->mmap_base;
mm->cached_hole_size = 0;
goto try_again;
}
bottomup:
/*
* A failed mmap() very likely causes application failure,
......
......@@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
spinlock_t *ptl;
int i;
down_write(&mm->mmap_sem);
pgd = pgd_offset(mm, 0xA0000);
if (pgd_none_or_clear_bad(pgd))
goto out;
......@@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
}
pte_unmap_unlock(pte, ptl);
out:
up_write(&mm->mmap_sem);
flush_tlb();
}
......
......@@ -308,10 +308,11 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
{
struct hstate *h = hstate_file(file);
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev_vma;
unsigned long base = mm->mmap_base, addr = addr0;
struct vm_area_struct *vma;
unsigned long base = mm->mmap_base;
unsigned long addr = addr0;
unsigned long largest_hole = mm->cached_hole_size;
int first_time = 1;
unsigned long start_addr;
/* don't allow allocations above current base */
if (mm->free_area_cache > base)
......@@ -322,6 +323,8 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
mm->free_area_cache = base;
}
try_again:
start_addr = mm->free_area_cache;
/* make sure it can fit in the remaining address space */
if (mm->free_area_cache < len)
goto fail;
......@@ -337,22 +340,14 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
if (!vma)
return addr;
/*
* new region fits between prev_vma->vm_end and
* vma->vm_start, use it:
*/
prev_vma = vma->vm_prev;
if (addr + len <= vma->vm_start &&
(!prev_vma || (addr >= prev_vma->vm_end))) {
if (addr + len <= vma->vm_start) {
/* remember the address as a hint for next time */
mm->cached_hole_size = largest_hole;
return (mm->free_area_cache = addr);
} else {
} else if (mm->free_area_cache == vma->vm_end) {
/* pull free_area_cache down to the first hole */
if (mm->free_area_cache == vma->vm_end) {
mm->free_area_cache = vma->vm_start;
mm->cached_hole_size = largest_hole;
}
mm->free_area_cache = vma->vm_start;
mm->cached_hole_size = largest_hole;
}
/* remember the largest hole we saw so far */
......@@ -368,10 +363,9 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
* if hint left us with no space for the requested
* mapping then try again:
*/
if (first_time) {
if (start_addr != base) {
mm->free_area_cache = base;
largest_hole = 0;
first_time = 0;
goto try_again;
}
/*
......
......@@ -60,7 +60,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
eb->nid = nid;
if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
emu_nid_to_phys[nid] = pb->nid;
emu_nid_to_phys[nid] = nid;
pb->start += size;
if (pb->start >= pb->end) {
......
......@@ -260,10 +260,7 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3,
goto badframe;
sigdelsetmask(&set, ~_BLOCKABLE);
spin_lock_irq(&current->sighand->siglock);
current->blocked = set;
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
set_current_blocked(&set);
if (restore_sigcontext(regs, frame))
goto badframe;
......@@ -336,8 +333,8 @@ gen_return_code(unsigned char *codemem)
}
static void setup_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs *regs)
static int setup_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs *regs)
{
struct rt_sigframe *frame;
int err = 0;
......@@ -422,12 +419,11 @@ static void setup_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
current->comm, current->pid, signal, frame, regs->pc);
#endif
return;
return 0;
give_sigsegv:
if (sig == SIGSEGV)
ka->sa.sa_handler = SIG_DFL;
force_sig(SIGSEGV, current);
force_sigsegv(sig, current);
return -EFAULT;
}
/*
......@@ -449,11 +445,8 @@ asmlinkage long xtensa_rt_sigsuspend(sigset_t __user *unewset,
return -EFAULT;
sigdelsetmask(&newset, ~_BLOCKABLE);
spin_lock_irq(&current->sighand->siglock);
saveset = current->blocked;
current->blocked = newset;
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
set_current_blocked(&newset);
regs->areg[2] = -EINTR;
while (1) {
......@@ -536,17 +529,11 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
/* Whee! Actually deliver the signal. */
/* Set up the stack frame */
setup_frame(signr, &ka, &info, oldset, regs);
if (ka.sa.sa_flags & SA_ONESHOT)
ka.sa.sa_handler = SIG_DFL;
ret = setup_frame(signr, &ka, &info, oldset, regs);
if (ret)
return ret;
spin_lock_irq(&current->sighand->siglock);
sigorsets(&current->blocked, &current->blocked, &ka.sa.sa_mask);
if (!(ka.sa.sa_flags & SA_NODEFER))
sigaddset(&current->blocked, signr);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
block_sigmask(&ka, signr);
if (current->ptrace & PT_SINGLESTEP)
task_pt_regs(current)->icountlevel = 1;
......
......@@ -507,8 +507,7 @@ int intel_idle_cpu_init(int cpu)
int num_substates;
if (cstate > max_cstate) {
printk(PREFIX "max_cstate %d reached\n",
max_cstate);
printk(PREFIX "max_cstate %d reached\n", max_cstate);
break;
}
......@@ -524,8 +523,9 @@ int intel_idle_cpu_init(int cpu)
dev->states_usage[dev->state_count].driver_data =
(void *)get_driver_data(cstate);
dev->state_count += 1;
}
dev->state_count += 1;
}
dev->cpu = cpu;
if (cpuidle_register_device(dev)) {
......
......@@ -346,7 +346,7 @@ static struct sysrq_key_op sysrq_term_op = {
static void moom_callback(struct work_struct *ignored)
{
out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0, NULL);
out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0, NULL, true);
}
static DECLARE_WORK(moom_work, moom_callback);
......
......@@ -822,7 +822,7 @@ static int exec_mmap(struct mm_struct *mm)
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
old_mm = current->mm;
sync_mm_rss(tsk, old_mm);
sync_mm_rss(old_mm);
mm_release(tsk, old_mm);
if (old_mm) {
......
......@@ -41,6 +41,25 @@ const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
/*
 * Settings for one hugetlbfs instance.
 * NOTE(review): presumably this is the type of the `config` variable read in
 * hugetlbfs_fill_super() -- confirm against the part of that function that is
 * not shown here.
 */
struct hugetlbfs_config {
/* NOTE(review): uid/gid/mode presumably describe the root inode -- confirm */
uid_t uid;
gid_t gid;
umode_t mode;
/* block limit; fill_super only creates a subpool when this is not -1 */
long nr_blocks;
/* inode limit; fill_super copies it to both max_inodes and free_inodes */
long nr_inodes;
/* huge page state; fill_super derives s_blocksize and s_blocksize_bits from it */
struct hstate *hstate;
};
/*
 * Per-inode hugetlbfs data wrapping the generic inode.
 * The inode is embedded (not pointed to), so the wrapper can be recovered
 * from a struct inode pointer with container_of().
 */
struct hugetlbfs_inode_info {
/* NOTE(review): presumably the shared memory policy for this file -- confirm */
struct shared_policy policy;
/* embedded VFS inode; the member name is what container_of() keys on */
struct inode vfs_inode;
};
/*
 * Map a VFS inode back to the hugetlbfs_inode_info that embeds it as
 * its vfs_inode member.
 */
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
{
	struct hugetlbfs_inode_info *info;

	info = container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
	return info;
}
static struct backing_dev_info hugetlbfs_backing_dev_info = {
.name = "hugetlbfs",
.ra_pages = 0, /* No readahead */
......@@ -154,10 +173,12 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
return addr;
}
start_addr = mm->free_area_cache;
if (len <= mm->cached_hole_size)
if (len > mm->cached_hole_size)
start_addr = mm->free_area_cache;
else {
start_addr = TASK_UNMAPPED_BASE;
mm->cached_hole_size = 0;
}
full_search:
addr = ALIGN(start_addr, huge_page_size(h));
......@@ -171,13 +192,18 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
*/
if (start_addr != TASK_UNMAPPED_BASE) {
start_addr = TASK_UNMAPPED_BASE;
mm->cached_hole_size = 0;
goto full_search;
}
return -ENOMEM;
}
if (!vma || addr + len <= vma->vm_start)
if (!vma || addr + len <= vma->vm_start) {
mm->free_area_cache = addr + len;
return addr;
}
if (addr + mm->cached_hole_size < vma->vm_start)
mm->cached_hole_size = vma->vm_start - addr;
addr = ALIGN(vma->vm_end, huge_page_size(h));
}
}
......@@ -238,17 +264,10 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
loff_t isize;
ssize_t retval = 0;
mutex_lock(&inode->i_mutex);
/* validate length */
if (len == 0)
goto out;
isize = i_size_read(inode);
if (!isize)
goto out;
end_index = (isize - 1) >> huge_page_shift(h);
for (;;) {
struct page *page;
unsigned long nr, ret;
......@@ -256,18 +275,21 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
/* nr is the maximum number of bytes to copy from this page */
nr = huge_page_size(h);
isize = i_size_read(inode);
if (!isize)
goto out;
end_index = (isize - 1) >> huge_page_shift(h);
if (index >= end_index) {
if (index > end_index)
goto out;
nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
if (nr <= offset) {
if (nr <= offset)
goto out;
}
}
nr = nr - offset;
/* Find the page */
page = find_get_page(mapping, index);
page = find_lock_page(mapping, index);
if (unlikely(page == NULL)) {
/*
* We have a HOLE, zero out the user-buffer for the
......@@ -279,17 +301,18 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
else
ra = 0;
} else {
unlock_page(page);
/*
* We have the page, copy it to user space buffer.
*/
ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
ret = ra;
page_cache_release(page);
}
if (ra < 0) {
if (retval == 0)
retval = ra;
if (page)
page_cache_release(page);
goto out;
}
......@@ -299,16 +322,12 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
index += offset >> huge_page_shift(h);
offset &= ~huge_page_mask(h);
if (page)
page_cache_release(page);
/* short read or no more work */
if ((ret != nr) || (len == 0))
break;
}
out:
*ppos = ((loff_t)index << huge_page_shift(h)) + offset;
mutex_unlock(&inode->i_mutex);
return retval;
}
......@@ -607,9 +626,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
spin_lock(&sbinfo->stat_lock);
/* If no limits set, just report 0 for max/free/used
* blocks, like simple_statfs() */
if (sbinfo->max_blocks >= 0) {
buf->f_blocks = sbinfo->max_blocks;
buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
if (sbinfo->spool) {
long free_pages;
spin_lock(&sbinfo->spool->lock);
buf->f_blocks = sbinfo->spool->max_hpages;
free_pages = sbinfo->spool->max_hpages
- sbinfo->spool->used_hpages;
buf->f_bavail = buf->f_bfree = free_pages;
spin_unlock(&sbinfo->spool->lock);
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
}
......@@ -625,6 +650,10 @@ static void hugetlbfs_put_super(struct super_block *sb)
if (sbi) {
sb->s_fs_info = NULL;
if (sbi->spool)
hugepage_put_subpool(sbi->spool);
kfree(sbi);
}
}
......@@ -853,10 +882,14 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_fs_info = sbinfo;
sbinfo->hstate = config.hstate;
spin_lock_init(&sbinfo->stat_lock);
sbinfo->max_blocks = config.nr_blocks;
sbinfo->free_blocks = config.nr_blocks;
sbinfo->max_inodes = config.nr_inodes;
sbinfo->free_inodes = config.nr_inodes;
sbinfo->spool = NULL;
if (config.nr_blocks != -1) {
sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
if (!sbinfo->spool)
goto out_free;
}
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = huge_page_size(config.hstate);
sb->s_blocksize_bits = huge_page_shift(config.hstate);
......@@ -868,38 +901,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
goto out_free;
return 0;
out_free:
if (sbinfo->spool)
kfree(sbinfo->spool);
kfree(sbinfo);
return -ENOMEM;
}
/*
 * Charge @delta blocks against the free-block count of the superblock
 * that owns @mapping.
 *
 * Returns 0 on success, or when free_blocks is not above -1 (nothing is
 * charged in that case). Returns -ENOMEM, leaving the count untouched,
 * when the charge would take the count below zero.
 */
int hugetlb_get_quota(struct address_space *mapping, long delta)
{
	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(mapping->host->i_sb);
	int err = -ENOMEM;

	/* accounting is skipped entirely when free_blocks is not above -1 */
	if (sbi->free_blocks <= -1)
		return 0;

	spin_lock(&sbi->stat_lock);
	if (sbi->free_blocks - delta >= 0) {
		sbi->free_blocks -= delta;
		err = 0;
	}
	spin_unlock(&sbi->stat_lock);

	return err;
}
/*
 * Give @delta blocks back to the free-block count of the superblock that
 * owns @mapping. Does nothing when free_blocks is not above -1.
 */
void hugetlb_put_quota(struct address_space *mapping, long delta)
{
	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(mapping->host->i_sb);

	if (sbi->free_blocks <= -1)
		return;

	spin_lock(&sbi->stat_lock);
	sbi->free_blocks += delta;
	spin_unlock(&sbi->stat_lock);
}
static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
......@@ -919,8 +926,8 @@ static int can_do_hugetlb_shm(void)
return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
}
struct file *hugetlb_file_setup(const char *name, size_t size,
vm_flags_t acctflag,
struct file *hugetlb_file_setup(const char *name, unsigned long addr,
size_t size, vm_flags_t acctflag,
struct user_struct **user, int creat_flags)
{
int error = -ENOMEM;
......@@ -929,6 +936,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
struct path path;
struct dentry *root;
struct qstr quick_string;
struct hstate *hstate;
unsigned long num_pages;
*user = NULL;
if (!hugetlbfs_vfsmount)
......@@ -937,7 +946,11 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
*user = current_user();
if (user_shm_lock(size, *user)) {
printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n");
task_lock(current);
printk_once(KERN_WARNING
"%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
current->comm, current->pid);
task_unlock(current);
} else {
*user = NULL;
return ERR_PTR(-EPERM);
......@@ -958,10 +971,12 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
if (!inode)
goto out_dentry;
hstate = hstate_inode(inode);
size += addr & ~huge_page_mask(hstate);
num_pages = ALIGN(size, huge_page_size(hstate)) >>
huge_page_shift(hstate);
error = -ENOMEM;
if (hugetlb_reserve_pages(inode, 0,
size >> huge_page_shift(hstate_inode(inode)), NULL,
acctflag))
if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag))
goto out_inode;
d_instantiate(path.dentry, inode);
......@@ -997,6 +1012,7 @@ static int __init init_hugetlbfs_fs(void)
if (error)
return error;
error = -ENOMEM;
hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
sizeof(struct hugetlbfs_inode_info),
0, 0, init_once);
......@@ -1015,10 +1031,10 @@ static int __init init_hugetlbfs_fs(void)
}
error = PTR_ERR(vfsmount);
unregister_filesystem(&hugetlbfs_fs_type);
out:
if (error)
kmem_cache_destroy(hugetlbfs_inode_cachep);
kmem_cache_destroy(hugetlbfs_inode_cachep);
out2:
bdi_destroy(&hugetlbfs_backing_dev_info);
return error;
......
......@@ -1455,9 +1455,15 @@ unsigned int full_name_hash(const unsigned char *name, unsigned int len)
}
EXPORT_SYMBOL(full_name_hash);
/*
 * Byte-replicated constants (0x01, '/' == 0x2f, and 0x80 in every byte).
 * Eight bytes wide when CONFIG_64BIT is set, four bytes wide otherwise.
 */
#ifdef CONFIG_64BIT
#define ONEBYTES 0x0101010101010101ul
#define SLASHBYTES 0x2f2f2f2f2f2f2f2ful
#define HIGHBITS 0x8080808080808080ul
#else
#define ONEBYTES 0x01010101ul
#define SLASHBYTES 0x2f2f2f2ful
#define HIGHBITS 0x80808080ul
#endif
/* Return the high bit set in the first byte that is a zero */
static inline unsigned long has_zero(unsigned long a)
......
......@@ -2989,9 +2989,9 @@ static const struct pid_entry tgid_base_stuff[] = {
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tgid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
REG("maps", S_IRUGO, proc_maps_operations),
REG("maps", S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_NUMA
REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
#endif
REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
LNK("cwd", proc_cwd_link),
......@@ -3002,7 +3002,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("mountstats", S_IRUSR, proc_mountstats_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_smaps_operations),
REG("smaps", S_IRUGO, proc_pid_smaps_operations),
REG("pagemap", S_IRUGO, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
......@@ -3348,9 +3348,9 @@ static const struct pid_entry tid_base_stuff[] = {
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
REG("maps", S_IRUGO, proc_maps_operations),
REG("maps", S_IRUGO, proc_tid_maps_operations),
#ifdef CONFIG_NUMA
REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations),
#endif
REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
LNK("cwd", proc_cwd_link),
......@@ -3360,7 +3360,7 @@ static const struct pid_entry tid_base_stuff[] = {
REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_smaps_operations),
REG("smaps", S_IRUGO, proc_tid_smaps_operations),
REG("pagemap", S_IRUGO, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
......
......@@ -53,9 +53,12 @@ extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task);
extern loff_t mem_lseek(struct file *file, loff_t offset, int orig);
extern const struct file_operations proc_maps_operations;
extern const struct file_operations proc_numa_maps_operations;
extern const struct file_operations proc_smaps_operations;
extern const struct file_operations proc_pid_maps_operations;
extern const struct file_operations proc_tid_maps_operations;
extern const struct file_operations proc_pid_numa_maps_operations;
extern const struct file_operations proc_tid_numa_maps_operations;
extern const struct file_operations proc_pid_smaps_operations;
extern const struct file_operations proc_tid_smaps_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;
extern const struct file_operations proc_net_operations;
......
......@@ -115,6 +115,8 @@ u64 stable_page_flags(struct page *page)
u |= 1 << KPF_COMPOUND_TAIL;
if (PageHuge(page))
u |= 1 << KPF_HUGE;
else if (PageTransCompound(page))
u |= 1 << KPF_THP;
/*
* Caveats on high order pages: page->_count will only be set
......
......@@ -209,16 +209,20 @@ static int do_maps_open(struct inode *inode, struct file *file,
return ret;
}
static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
{
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
vm_flags_t flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
unsigned long start, end;
dev_t dev = 0;
int len;
const char *name = NULL;
if (file) {
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
......@@ -252,36 +256,57 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
if (file) {
pad_len_spaces(m, len);
seq_path(m, &file->f_path, "\n");
} else {
const char *name = arch_vma_name(vma);
if (!name) {
if (mm) {
if (vma->vm_start <= mm->brk &&
vma->vm_end >= mm->start_brk) {
name = "[heap]";
} else if (vma->vm_start <= mm->start_stack &&
vma->vm_end >= mm->start_stack) {
name = "[stack]";
}
goto done;
}
name = arch_vma_name(vma);
if (!name) {
pid_t tid;
if (!mm) {
name = "[vdso]";
goto done;
}
if (vma->vm_start <= mm->brk &&
vma->vm_end >= mm->start_brk) {
name = "[heap]";
goto done;
}
tid = vm_is_stack(task, vma, is_pid);
if (tid != 0) {
/*
* Thread stack in /proc/PID/task/TID/maps or
* the main process stack.
*/
if (!is_pid || (vma->vm_start <= mm->start_stack &&
vma->vm_end >= mm->start_stack)) {
name = "[stack]";
} else {
name = "[vdso]";
/* Thread stack in /proc/PID/maps */
pad_len_spaces(m, len);
seq_printf(m, "[stack:%d]", tid);
}
}
if (name) {
pad_len_spaces(m, len);
seq_puts(m, name);
}
}
done:
if (name) {
pad_len_spaces(m, len);
seq_puts(m, name);
}
seq_putc(m, '\n');
}
static int show_map(struct seq_file *m, void *v)
static int show_map(struct seq_file *m, void *v, int is_pid)
{
struct vm_area_struct *vma = v;
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
show_map_vma(m, vma);
show_map_vma(m, vma, is_pid);
if (m->count < m->size) /* vma is copied successfully */
m->version = (vma != get_gate_vma(task->mm))
......@@ -289,20 +314,49 @@ static int show_map(struct seq_file *m, void *v)
return 0;
}
/* seq_file ->show handler for the process-level maps view (is_pid == 1). */
static int show_pid_map(struct seq_file *m, void *v)
{
	const int is_pid = 1;

	return show_map(m, v, is_pid);
}
/* seq_file ->show handler for the task-level maps view (is_pid == 0). */
static int show_tid_map(struct seq_file *m, void *v)
{
	const int is_pid = 0;

	return show_map(m, v, is_pid);
}
static const struct seq_operations proc_pid_maps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = show_map
.show = show_pid_map
};
static int maps_open(struct inode *inode, struct file *file)
/*
 * Iterator for the task-level maps file: uses the common m_start/m_next/
 * m_stop walkers and the task-specific show routine.
 */
static const struct seq_operations proc_tid_maps_op = {
	.show	= show_tid_map,
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
};
/* ->open for the process-level maps file: binds proc_pid_maps_op. */
static int pid_maps_open(struct inode *inode, struct file *file)
{
	int ret = do_maps_open(inode, file, &proc_pid_maps_op);

	return ret;
}
const struct file_operations proc_maps_operations = {
.open = maps_open,
/* ->open for the task-level maps file: binds proc_tid_maps_op. */
static int tid_maps_open(struct inode *inode, struct file *file)
{
	int ret = do_maps_open(inode, file, &proc_tid_maps_op);

	return ret;
}
/*
 * File operations for the process-level maps file: a seq_file opened
 * through pid_maps_open().
 */
const struct file_operations proc_pid_maps_operations = {
	.open		= pid_maps_open,
	.release	= seq_release_private,
	.read		= seq_read,
	.llseek		= seq_lseek,
};
const struct file_operations proc_tid_maps_operations = {
.open = tid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
......@@ -394,21 +448,15 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
spinlock_t *ptl;
spin_lock(&walk->mm->page_table_lock);
if (pmd_trans_huge(*pmd)) {
if (pmd_trans_splitting(*pmd)) {
spin_unlock(&walk->mm->page_table_lock);
wait_split_huge_page(vma->anon_vma, pmd);
} else {
smaps_pte_entry(*(pte_t *)pmd, addr,
HPAGE_PMD_SIZE, walk);
spin_unlock(&walk->mm->page_table_lock);
mss->anonymous_thp += HPAGE_PMD_SIZE;
return 0;
}
} else {
if (pmd_trans_huge_lock(pmd, vma) == 1) {
smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
spin_unlock(&walk->mm->page_table_lock);
mss->anonymous_thp += HPAGE_PMD_SIZE;
return 0;
}
if (pmd_trans_unstable(pmd))
return 0;
/*
* The mmap_sem held all the way back in m_start() is what
* keeps khugepaged out of here and from collapsing things
......@@ -422,7 +470,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
return 0;
}
static int show_smap(struct seq_file *m, void *v)
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
......@@ -440,7 +488,7 @@ static int show_smap(struct seq_file *m, void *v)
if (vma->vm_mm && !is_vm_hugetlb_page(vma))
walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
show_map_vma(m, vma);
show_map_vma(m, vma, is_pid);
seq_printf(m,
"Size: %8lu kB\n"
......@@ -479,20 +527,49 @@ static int show_smap(struct seq_file *m, void *v)
return 0;
}
/* seq_file ->show handler for the process-level smaps view (is_pid == 1). */
static int show_pid_smap(struct seq_file *m, void *v)
{
	const int is_pid = 1;

	return show_smap(m, v, is_pid);
}
/* seq_file ->show handler for the task-level smaps view (is_pid == 0). */
static int show_tid_smap(struct seq_file *m, void *v)
{
	const int is_pid = 0;

	return show_smap(m, v, is_pid);
}
static const struct seq_operations proc_pid_smaps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = show_smap
.show = show_pid_smap
};
/*
 * Iterator for the task-level smaps file: uses the common m_start/m_next/
 * m_stop walkers and the task-specific show routine.
 */
static const struct seq_operations proc_tid_smaps_op = {
	.show	= show_tid_smap,
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
};
static int smaps_open(struct inode *inode, struct file *file)
/* ->open for the process-level smaps file: binds proc_pid_smaps_op. */
static int pid_smaps_open(struct inode *inode, struct file *file)
{
	int ret = do_maps_open(inode, file, &proc_pid_smaps_op);

	return ret;
}
const struct file_operations proc_smaps_operations = {
.open = smaps_open,
/* ->open for the task-level smaps file: binds proc_tid_smaps_op. */
static int tid_smaps_open(struct inode *inode, struct file *file)
{
	int ret = do_maps_open(inode, file, &proc_tid_smaps_op);

	return ret;
}
/*
 * File operations for the process-level smaps file: a seq_file opened
 * through pid_smaps_open().
 */
const struct file_operations proc_pid_smaps_operations = {
	.open		= pid_smaps_open,
	.release	= seq_release_private,
	.read		= seq_read,
	.llseek		= seq_lseek,
};
const struct file_operations proc_tid_smaps_operations = {
.open = tid_smaps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
......@@ -507,6 +584,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
struct page *page;
split_huge_page_pmd(walk->mm, pmd);
if (pmd_trans_unstable(pmd))
return 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
......@@ -598,11 +677,18 @@ const struct file_operations proc_clear_refs_operations = {
.llseek = noop_llseek,
};
/*
 * A single pagemap entry: a struct wrapper around the raw 64-bit value,
 * so entries are passed around as a distinct type rather than a bare u64.
 */
typedef struct {
	u64 pme;	/* raw entry bits; see make_pme() */
} pagemap_entry_t;
struct pagemapread {
int pos, len;
u64 *buffer;
pagemap_entry_t *buffer;
};
/*
 * Walk granularity for pagemap, expressed in terms of the pmd size/mask.
 * pagemap_pte_range() uses ~PAGEMAP_WALK_MASK to turn an address into a
 * page offset inside a huge pmd.
 */
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
#define PAGEMAP_WALK_MASK (PMD_MASK)
#define PM_ENTRY_BYTES sizeof(u64)
#define PM_STATUS_BITS 3
#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
......@@ -620,10 +706,15 @@ struct pagemapread {
#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT)
#define PM_END_OF_BUFFER 1
static int add_to_pagemap(unsigned long addr, u64 pfn,
/* Wrap a raw 64-bit value into a pagemap_entry_t. */
static inline pagemap_entry_t make_pme(u64 val)
{
	pagemap_entry_t entry = { .pme = val };

	return entry;
}
static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
struct pagemapread *pm)
{
pm->buffer[pm->pos++] = pfn;
pm->buffer[pm->pos++] = *pme;
if (pm->pos >= pm->len)
return PM_END_OF_BUFFER;
return 0;
......@@ -635,8 +726,10 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
struct pagemapread *pm = walk->private;
unsigned long addr;
int err = 0;
pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
for (addr = start; addr < end; addr += PAGE_SIZE) {
err = add_to_pagemap(addr, PM_NOT_PRESENT, pm);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
}
......@@ -649,17 +742,35 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte)
return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
}
static u64 pte_to_pagemap_entry(pte_t pte)
static void pte_to_pagemap_entry(pagemap_entry_t *pme, pte_t pte)
{
u64 pme = 0;
if (is_swap_pte(pte))
pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte))
| PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
*pme = make_pme(PM_PFRAME(swap_pte_to_pagemap_entry(pte))
| PM_PSHIFT(PAGE_SHIFT) | PM_SWAP);
else if (pte_present(pte))
pme = PM_PFRAME(pte_pfn(pte))
| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
return pme;
*pme = make_pme(PM_PFRAME(pte_pfn(pte))
| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Build the pagemap entry for the page at index 'offset' (in pages) inside
 * the huge page mapped by 'pmd' and store it in *pme.  *pme is left
 * untouched when the pmd is not present.
 */
static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
		pmd_t pmd, int offset)
{
	/*
	 * Currently pmd for thp is always present because thp can not be
	 * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
	 * This check is just to prepare for future implementation.
	 */
	if (!pmd_present(pmd))
		return;

	*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
			| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
}
#else
/* !CONFIG_TRANSPARENT_HUGEPAGE variant: nothing to do, *pme is not modified. */
static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
		pmd_t pmd, int offset)
{
}
#endif
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
......@@ -668,13 +779,30 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct pagemapread *pm = walk->private;
pte_t *pte;
int err = 0;
pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
split_huge_page_pmd(walk->mm, pmd);
if (pmd_trans_unstable(pmd))
return 0;
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
spin_lock(&walk->mm->page_table_lock);
if (pmd_trans_huge_lock(pmd, vma) == 1) {
for (; addr != end; addr += PAGE_SIZE) {
unsigned long offset;
offset = (addr & ~PAGEMAP_WALK_MASK) >>
PAGE_SHIFT;
thp_pmd_to_pagemap_entry(&pme, *pmd, offset);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
}
spin_unlock(&walk->mm->page_table_lock);
return err;
}
for (; addr != end; addr += PAGE_SIZE) {
u64 pfn = PM_NOT_PRESENT;
/* check to see if we've left 'vma' behind
* and need a new, higher one */
......@@ -686,11 +814,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (vma && (vma->vm_start <= addr) &&
!is_vm_hugetlb_page(vma)) {
pte = pte_offset_map(pmd, addr);
pfn = pte_to_pagemap_entry(*pte);
pte_to_pagemap_entry(&pme, *pte);
/* unmap before userspace copy */
pte_unmap(pte);
}
err = add_to_pagemap(addr, pfn, pm);
err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
}
......@@ -701,13 +829,12 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
#ifdef CONFIG_HUGETLB_PAGE
static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme,
pte_t pte, int offset)
{
u64 pme = 0;
if (pte_present(pte))
pme = PM_PFRAME(pte_pfn(pte) + offset)
| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
return pme;
*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
}
/* This function walks within one hugetlb entry in the single call */
......@@ -717,12 +844,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
{
struct pagemapread *pm = walk->private;
int err = 0;
u64 pfn;
pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
for (; addr != end; addr += PAGE_SIZE) {
int offset = (addr & ~hmask) >> PAGE_SHIFT;
pfn = huge_pte_to_pagemap_entry(*pte, offset);
err = add_to_pagemap(addr, pfn, pm);
huge_pte_to_pagemap_entry(&pme, *pte, offset);
err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
}
......@@ -757,8 +884,6 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
* determine which areas of memory are actually mapped and llseek to
* skip over unmapped regions.
*/
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
#define PAGEMAP_WALK_MASK (PMD_MASK)
static ssize_t pagemap_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
......@@ -941,26 +1066,21 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
pte_t *pte;
md = walk->private;
spin_lock(&walk->mm->page_table_lock);
if (pmd_trans_huge(*pmd)) {
if (pmd_trans_splitting(*pmd)) {
spin_unlock(&walk->mm->page_table_lock);
wait_split_huge_page(md->vma->anon_vma, pmd);
} else {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
page = can_gather_numa_stats(huge_pte, md->vma, addr);
if (page)
gather_stats(page, md, pte_dirty(huge_pte),
HPAGE_PMD_SIZE/PAGE_SIZE);
spin_unlock(&walk->mm->page_table_lock);
return 0;
}
} else {
if (pmd_trans_huge_lock(pmd, md->vma) == 1) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
page = can_gather_numa_stats(huge_pte, md->vma, addr);
if (page)
gather_stats(page, md, pte_dirty(huge_pte),
HPAGE_PMD_SIZE/PAGE_SIZE);
spin_unlock(&walk->mm->page_table_lock);
return 0;
}
if (pmd_trans_unstable(pmd))
return 0;
orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
do {
struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
......@@ -1002,7 +1122,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
/*
* Display pages allocated per node and memory policy via /proc.
*/
static int show_numa_map(struct seq_file *m, void *v)
static int show_numa_map(struct seq_file *m, void *v, int is_pid)
{
struct numa_maps_private *numa_priv = m->private;
struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
......@@ -1039,9 +1159,19 @@ static int show_numa_map(struct seq_file *m, void *v)
seq_path(m, &file->f_path, "\n\t= ");
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_printf(m, " heap");
} else if (vma->vm_start <= mm->start_stack &&
vma->vm_end >= mm->start_stack) {
seq_printf(m, " stack");
} else {
pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid);
if (tid != 0) {
/*
* Thread stack in /proc/PID/task/TID/maps or
* the main process stack.
*/
if (!is_pid || (vma->vm_start <= mm->start_stack &&
vma->vm_end >= mm->start_stack))
seq_printf(m, " stack");
else
seq_printf(m, " stack:%d", tid);
}
}
if (is_vm_hugetlb_page(vma))
......@@ -1084,21 +1214,39 @@ static int show_numa_map(struct seq_file *m, void *v)
return 0;
}
/*
 * seq_file ->show callback installed in proc_pid_numa_maps_op.
 * Forwards to show_numa_map() with is_pid set to 1.
 */
static int show_pid_numa_map(struct seq_file *m, void *v)
{
	const int is_pid = 1;

	return show_numa_map(m, v, is_pid);
}
/*
 * seq_file ->show callback installed in proc_tid_numa_maps_op.
 * Forwards to show_numa_map() with is_pid set to 0.
 */
static int show_tid_numa_map(struct seq_file *m, void *v)
{
	const int is_pid = 0;

	return show_numa_map(m, v, is_pid);
}
static const struct seq_operations proc_pid_numa_maps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = show_numa_map,
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = show_pid_numa_map,
};
/*
 * seq_file iterator for the tid flavour of numa_maps.  Uses the common
 * m_start/m_next/m_stop; ->show is the tid-specific wrapper.
 */
static const struct seq_operations proc_tid_numa_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_tid_numa_map,
};
static int numa_maps_open(struct inode *inode, struct file *file)
static int numa_maps_open(struct inode *inode, struct file *file,
const struct seq_operations *ops)
{
struct numa_maps_private *priv;
int ret = -ENOMEM;
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (priv) {
priv->proc_maps.pid = proc_pid(inode);
ret = seq_open(file, &proc_pid_numa_maps_op);
ret = seq_open(file, ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = priv;
......@@ -1109,8 +1257,25 @@ static int numa_maps_open(struct inode *inode, struct file *file)
return ret;
}
const struct file_operations proc_numa_maps_operations = {
.open = numa_maps_open,
/* ->open handler: passes the pid numa_maps iterator to numa_maps_open(). */
static int pid_numa_maps_open(struct inode *inode, struct file *file)
{
	const struct seq_operations *ops = &proc_pid_numa_maps_op;

	return numa_maps_open(inode, file, ops);
}
/* ->open handler: passes the tid numa_maps iterator to numa_maps_open(). */
static int tid_numa_maps_open(struct inode *inode, struct file *file)
{
	const struct seq_operations *ops = &proc_tid_numa_maps_op;

	return numa_maps_open(inode, file, ops);
}
/*
 * File operations for the pid flavour of numa_maps: opened through
 * pid_numa_maps_open(), then served by the generic seq_file helpers.
 */
const struct file_operations proc_pid_numa_maps_operations = {
	.open		= pid_numa_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};
const struct file_operations proc_tid_numa_maps_operations = {
.open = tid_numa_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
......
......@@ -134,9 +134,11 @@ static void pad_len_spaces(struct seq_file *m, int len)
/*
* display a single VMA to a sequenced file
*/
static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
int is_pid)
{
struct mm_struct *mm = vma->vm_mm;
struct proc_maps_private *priv = m->private;
unsigned long ino = 0;
struct file *file;
dev_t dev = 0;
......@@ -168,10 +170,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
pad_len_spaces(m, len);
seq_path(m, &file->f_path, "");
} else if (mm) {
if (vma->vm_start <= mm->start_stack &&
vma->vm_end >= mm->start_stack) {
pid_t tid = vm_is_stack(priv->task, vma, is_pid);
if (tid != 0) {
pad_len_spaces(m, len);
seq_puts(m, "[stack]");
/*
* Thread stack in /proc/PID/task/TID/maps or
* the main process stack.
*/
if (!is_pid || (vma->vm_start <= mm->start_stack &&
vma->vm_end >= mm->start_stack))
seq_printf(m, "[stack]");
else
seq_printf(m, "[stack:%d]", tid);
}
}
......@@ -182,11 +193,22 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
/*
* display mapping lines for a particular process's /proc/pid/maps
*/
static int show_map(struct seq_file *m, void *_p)
static int show_map(struct seq_file *m, void *_p, int is_pid)
{
struct rb_node *p = _p;
return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb),
is_pid);
}
/*
 * seq_file ->show callback installed in proc_pid_maps_ops.
 * Forwards to show_map() with is_pid set to 1.
 */
static int show_pid_map(struct seq_file *m, void *_p)
{
	const int is_pid = 1;

	return show_map(m, _p, is_pid);
}
/*
 * seq_file ->show callback installed in proc_tid_maps_ops.
 * Forwards to show_map() with is_pid set to 0.
 */
static int show_tid_map(struct seq_file *m, void *_p)
{
	const int is_pid = 0;

	return show_map(m, _p, is_pid);
}
static void *m_start(struct seq_file *m, loff_t *pos)
......@@ -240,10 +262,18 @@ static const struct seq_operations proc_pid_maps_ops = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = show_map
.show = show_pid_map
};
/*
 * seq_file iterator for the tid flavour of maps.  Uses the common
 * m_start/m_next/m_stop; ->show is the tid-specific wrapper.
 */
static const struct seq_operations proc_tid_maps_ops = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_tid_map,
};
static int maps_open(struct inode *inode, struct file *file)
static int maps_open(struct inode *inode, struct file *file,
const struct seq_operations *ops)
{
struct proc_maps_private *priv;
int ret = -ENOMEM;
......@@ -251,7 +281,7 @@ static int maps_open(struct inode *inode, struct file *file)
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (priv) {
priv->pid = proc_pid(inode);
ret = seq_open(file, &proc_pid_maps_ops);
ret = seq_open(file, ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = priv;
......@@ -262,8 +292,25 @@ static int maps_open(struct inode *inode, struct file *file)
return ret;
}
const struct file_operations proc_maps_operations = {
.open = maps_open,
/* ->open handler: passes the pid maps iterator to maps_open(). */
static int pid_maps_open(struct inode *inode, struct file *file)
{
	const struct seq_operations *ops = &proc_pid_maps_ops;

	return maps_open(inode, file, ops);
}
/* ->open handler: passes the tid maps iterator to maps_open(). */
static int tid_maps_open(struct inode *inode, struct file *file)
{
	const struct seq_operations *ops = &proc_tid_maps_ops;

	return maps_open(inode, file, ops);
}
/*
 * File operations for the pid flavour of maps: opened through
 * pid_maps_open(), then served by the generic seq_file helpers.
 */
const struct file_operations proc_pid_maps_operations = {
	.open		= pid_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};
const struct file_operations proc_tid_maps_operations = {
.open = tid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
......
......@@ -140,9 +140,21 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
mutex_lock(&m->lock);
/*
* seq_file->op->..m_start/m_stop/m_next may do special actions
* or optimisations based on the file->f_version, so we want to
* pass the file->f_version to those methods.
*
* seq_file->version is just copy of f_version, and seq_file
* methods can treat it simply as file version.
* It is copied in first and copied out after all operations.
* It is convenient to have it as part of structure to avoid the
* need of passing another argument to all the seq_file methods.
*/
m->version = file->f_version;
/* Don't assume *ppos is where we left it */
if (unlikely(*ppos != m->read_pos)) {
m->read_pos = *ppos;
while ((err = traverse(m, *ppos)) == -EAGAIN)
;
if (err) {
......@@ -152,21 +164,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
m->index = 0;
m->count = 0;
goto Done;
} else {
m->read_pos = *ppos;
}
}
/*
* seq_file->op->..m_start/m_stop/m_next may do special actions
* or optimisations based on the file->f_version, so we want to
* pass the file->f_version to those methods.
*
* seq_file->version is just copy of f_version, and seq_file
* methods can treat it simply as file version.
* It is copied in first and copied out after all operations.
* It is convenient to have it as part of structure to avoid the
* need of passing another argument to all the seq_file methods.
*/
m->version = file->f_version;
/* grab buffer if we didn't have one */
if (!m->buf) {
m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
......
......@@ -425,6 +425,8 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
unsigned long size);
#endif
#ifdef CONFIG_MMU
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
static inline int pmd_trans_huge(pmd_t pmd)
{
......@@ -441,7 +443,66 @@ static inline int pmd_write(pmd_t pmd)
return 0;
}
#endif /* __HAVE_ARCH_PMD_WRITE */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
* This function is meant to be used by sites walking pagetables with
* the mmap_sem held in read mode to protect against MADV_DONTNEED and
* transhuge page faults. MADV_DONTNEED can convert a transhuge pmd
* into a null pmd and the transhuge page fault can convert a null pmd
* into a hugepmd or into a regular pmd (if the hugepage allocation
* fails). While holding the mmap_sem in read mode the pmd becomes
* stable and stops changing under us only if it's not null and not a
* transhuge pmd. When those races occur and this function makes a
* difference vs the standard pmd_none_or_clear_bad, the result is
* undefined so behaving as if the pmd was none is safe (because it
* can return none anyway). The compiler level barrier() is critically
* important to compute the two checks atomically on the same pmdval.
*/
/*
 * Returns 1 when the sampled pmd value is none or bad, 0 otherwise.
 * All checks are made on one local snapshot (pmdval) of *pmd.
 */
static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
{
/* depend on compiler for an atomic pmd read */
pmd_t pmdval = *pmd;
/*
* The barrier will stabilize the pmdval in a register or on
* the stack so that it will stop changing under the code.
*/
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
barrier();
#endif
/* Empty pmd: nothing below it to walk. */
if (pmd_none(pmdval))
return 1;
if (unlikely(pmd_bad(pmdval))) {
/*
* pmd_clear_bad() is only called when the snapshot is not a
* transhuge pmd; either way the caller is told to skip (1).
*/
if (!pmd_trans_huge(pmdval))
pmd_clear_bad(pmd);
return 1;
}
/* Snapshot was neither none nor bad. */
return 0;
}
/*
* This is a noop if Transparent Hugepage Support is not built into
* the kernel. Otherwise it is equivalent to
* pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in
* places that already verified the pmd is not none and they want to
* walk ptes while holding the mmap sem in read mode (write mode doesn't
* need this). If THP is not enabled, the pmd can't go away under the
* code even if MADV_DONTNEED runs, but if THP is enabled we need to
* run a pmd_trans_unstable before walking the ptes after
* split_huge_page_pmd returns (because it may have run when the pmd
* became null, but then a page fault can map in a THP and not a
* regular page).
*/
/*
 * Returns 0 unconditionally when CONFIG_TRANSPARENT_HUGEPAGE is not set;
 * otherwise returns the result of pmd_none_or_trans_huge_or_clear_bad().
 */
static inline int pmd_trans_unstable(pmd_t *pmd)
{
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
	return 0;
#else
	return pmd_none_or_trans_huge_or_clear_bad(pmd);
#endif
}
#endif /* CONFIG_MMU */
#endif /* !__ASSEMBLY__ */
......
......@@ -498,7 +498,7 @@ struct cgroup_subsys {
struct list_head sibling;
/* used when use_id == true */
struct idr idr;
rwlock_t id_lock;
spinlock_t id_lock;
/* should be defined only by modular subsystems */
struct module *module;
......
......@@ -23,6 +23,7 @@ extern int fragmentation_index(struct zone *zone, unsigned int order);
extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *mask,
bool sync);
extern int compact_pgdat(pg_data_t *pgdat, int order);
extern unsigned long compaction_suitable(struct zone *zone, int order);
/* Do not skip compaction more than 64 times */
......@@ -33,20 +34,26 @@ extern unsigned long compaction_suitable(struct zone *zone, int order);
* allocation success. 1 << compact_defer_limit compactions are skipped up
* to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
*/
static inline void defer_compaction(struct zone *zone)
/*
 * Record a deferral for 'zone': reset compact_considered, bump
 * compact_defer_shift (capped at COMPACT_MAX_DEFER_SHIFT) and lower
 * compact_order_failed to 'order' if 'order' is smaller.
 */
static inline void defer_compaction(struct zone *zone, int order)
{
	zone->compact_considered = 0;

	/* Increase the defer shift by one step, saturating at the maximum. */
	if (++zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;

	/* Remember the smallest order passed in so far. */
	if (order < zone->compact_order_failed)
		zone->compact_order_failed = order;
}
/* Returns true if compaction should be skipped this time */
static inline bool compaction_deferred(struct zone *zone)
static inline bool compaction_deferred(struct zone *zone, int order)
{
unsigned long defer_limit = 1UL << zone->compact_defer_shift;
if (order < zone->compact_order_failed)
return false;
/* Avoid possible overflow */
if (++zone->compact_considered > defer_limit)
zone->compact_considered = defer_limit;
......@@ -62,16 +69,21 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
return COMPACT_CONTINUE;
}
/*
 * Stub: performs no work and always reports COMPACT_CONTINUE.
 * NOTE(review): presumably the variant built when compaction is not
 * configured - the enclosing #ifdef is outside this view.
 */
static inline int compact_pgdat(pg_data_t *pgdat, int order)
{
	return COMPACT_CONTINUE;
}
/* Stub: always reports COMPACT_SKIPPED, whatever the zone and order. */
static inline unsigned long compaction_suitable(struct zone *zone, int order)
{
	return COMPACT_SKIPPED;
}
static inline void defer_compaction(struct zone *zone)
/* Stub: no deferral state is touched in this variant. */
static inline void defer_compaction(struct zone *zone, int order)
{
}
static inline bool compaction_deferred(struct zone *zone)
static inline bool compaction_deferred(struct zone *zone, int order)
{
return 1;
}
......
......@@ -89,42 +89,33 @@ extern void rebuild_sched_domains(void);
extern void cpuset_print_task_mems_allowed(struct task_struct *p);
/*
* reading current mems_allowed and mempolicy in the fastpath must protected
* by get_mems_allowed()
* get_mems_allowed is required when making decisions involving mems_allowed
* such as during page allocation. mems_allowed can be updated in parallel
* and depending on the new value an operation can fail potentially causing
* process failure. A retry loop with get_mems_allowed and put_mems_allowed
* prevents these artificial failures.
*/
static inline void get_mems_allowed(void)
static inline unsigned int get_mems_allowed(void)
{
current->mems_allowed_change_disable++;
/*
* ensure that reading mems_allowed and mempolicy happens after the
* update of ->mems_allowed_change_disable.
*
* the write-side task finds ->mems_allowed_change_disable is not 0,
* and knows the read-side task is reading mems_allowed or mempolicy,
* so it will clear old bits lazily.
*/
smp_mb();
return read_seqcount_begin(&current->mems_allowed_seq);
}
static inline void put_mems_allowed(void)
/*
* If this returns false, the operation that took place after get_mems_allowed
* may have failed. It is up to the caller to retry the operation if
* appropriate.
*/
static inline bool put_mems_allowed(unsigned int seq)
{
/*
* ensure that reading mems_allowed and mempolicy before reducing
* mems_allowed_change_disable.
*
* the write-side task will know that the read-side task is still
* reading mems_allowed or mempolicy, don't clears old bits in the
* nodemask.
*/
smp_mb();
--ACCESS_ONCE(current->mems_allowed_change_disable);
return !read_seqcount_retry(&current->mems_allowed_seq, seq);
}
/*
 * Replace current->mems_allowed with 'nodemask'.  The store happens under
 * task_lock(current) and inside a mems_allowed_seq write section, so a
 * reader using read_seqcount_retry() on that counter can detect the change.
 */
static inline void set_mems_allowed(nodemask_t nodemask)
{
	struct task_struct *tsk = current;

	task_lock(tsk);
	write_seqcount_begin(&tsk->mems_allowed_seq);
	tsk->mems_allowed = nodemask;
	write_seqcount_end(&tsk->mems_allowed_seq);
	task_unlock(tsk);
}
......@@ -234,12 +225,14 @@ static inline void set_mems_allowed(nodemask_t nodemask)
{
}
static inline void get_mems_allowed(void)
/* !CONFIG_CPUSETS variant: there is no sequence counter, so hand back 0. */
static inline unsigned int get_mems_allowed(void)
{
	return 0;
}
static inline void put_mems_allowed(void)
/* !CONFIG_CPUSETS variant: 'seq' is ignored and the result is always true. */
static inline bool put_mems_allowed(unsigned int seq)
{
	return true;
}
#endif /* !CONFIG_CPUSETS */
......
......@@ -51,6 +51,9 @@ extern pmd_t *page_check_address_pmd(struct page *page,
unsigned long address,
enum page_check_address_pmd_flag flag);
/*
 * Order and page count of a pmd-sized huge page, derived from
 * HPAGE_PMD_SHIFT.  Defined ahead of the CONFIG_TRANSPARENT_HUGEPAGE
 * #ifdef so both configurations see them; they expand using whichever
 * HPAGE_PMD_SHIFT definition is selected below.
 */
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define HPAGE_PMD_SHIFT HPAGE_SHIFT
#define HPAGE_PMD_MASK HPAGE_MASK
......@@ -102,8 +105,6 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
BUG_ON(pmd_trans_splitting(*____pmd) || \
pmd_trans_huge(*____pmd)); \
} while (0)
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
#if HPAGE_PMD_ORDER > MAX_ORDER
#error "hugepages can't be allocated by the buddy allocator"
#endif
......@@ -113,6 +114,18 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
long adjust_next);
extern int __pmd_trans_huge_lock(pmd_t *pmd,
struct vm_area_struct *vma);
/*
 * mmap_sem must be held on entry (asserted with VM_BUG_ON).
 *
 * Returns 0 straight away when *pmd is not a transparent huge pmd;
 * otherwise returns whatever __pmd_trans_huge_lock() returns.
 * NOTE(review): callers visible in this change test for a return of 1 and
 * then drop mm->page_table_lock themselves - confirm the locking contract
 * against __pmd_trans_huge_lock().
 */
static inline int pmd_trans_huge_lock(pmd_t *pmd,
		struct vm_area_struct *vma)
{
	VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));

	if (!pmd_trans_huge(*pmd))
		return 0;

	return __pmd_trans_huge_lock(pmd, vma);
}
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
......@@ -146,9 +159,9 @@ static inline struct page *compound_trans_head(struct page *page)
return page;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
#define HPAGE_PMD_SHIFT ({ BUG(); 0; })
#define HPAGE_PMD_MASK ({ BUG(); 0; })
#define HPAGE_PMD_SIZE ({ BUG(); 0; })
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })
#define hpage_nr_pages(x) 1
......@@ -176,6 +189,11 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
long adjust_next)
{
}
/* !CONFIG_TRANSPARENT_HUGEPAGE variant: there is never a huge pmd, so return 0. */
static inline int pmd_trans_huge_lock(pmd_t *pmd,
		struct vm_area_struct *vma)
{
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* _LINUX_HUGE_MM_H */
......@@ -14,6 +14,15 @@ struct user_struct;
#include <linux/shm.h>
#include <asm/tlbflush.h>
/*
 * State of a huge page subpool, as created by hugepage_new_subpool() and
 * released with hugepage_put_subpool().
 * NOTE(review): field roles below are inferred from their names - confirm
 * against the implementation.
 */
struct hugepage_subpool {
	spinlock_t lock;	/* presumably guards the counters below */
	long count;		/* presumably a reference count */
	long max_hpages;	/* presumably the page limit for this subpool */
	long used_hpages;	/* presumably pages currently charged to it */
};
struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
void hugepage_put_subpool(struct hugepage_subpool *spool);
int PageHuge(struct page *page);
void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
......@@ -128,35 +137,14 @@ enum {
};
#ifdef CONFIG_HUGETLBFS
struct hugetlbfs_config {
uid_t uid;
gid_t gid;
umode_t mode;
long nr_blocks;
long nr_inodes;
struct hstate *hstate;
};
struct hugetlbfs_sb_info {
long max_blocks; /* blocks allowed */
long free_blocks; /* blocks free */
long max_inodes; /* inodes allowed */
long free_inodes; /* inodes free */
spinlock_t stat_lock;
struct hstate *hstate;
struct hugepage_subpool *spool;
};
struct hugetlbfs_inode_info {
struct shared_policy policy;
struct inode vfs_inode;
};
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
{
return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
}
static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
{
return sb->s_fs_info;
......@@ -164,10 +152,9 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
extern const struct file_operations hugetlbfs_file_operations;
extern const struct vm_operations_struct hugetlb_vm_ops;
struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
struct file *hugetlb_file_setup(const char *name, unsigned long addr,
size_t size, vm_flags_t acct,
struct user_struct **user, int creat_flags);
int hugetlb_get_quota(struct address_space *mapping, long delta);
void hugetlb_put_quota(struct address_space *mapping, long delta);
static inline int is_file_hugepages(struct file *file)
{
......@@ -179,15 +166,11 @@ static inline int is_file_hugepages(struct file *file)
return 0;
}
static inline void set_file_hugepages(struct file *file)
{
file->f_op = &hugetlbfs_file_operations;
}
#else /* !CONFIG_HUGETLBFS */
#define is_file_hugepages(file) 0
#define set_file_hugepages(file) BUG()
static inline struct file *hugetlb_file_setup(const char *name, size_t size,
static inline struct file *
hugetlb_file_setup(const char *name, unsigned long addr, size_t size,
vm_flags_t acctflag, struct user_struct **user, int creat_flags)
{
return ERR_PTR(-ENOSYS);
......
......@@ -29,6 +29,13 @@ extern struct fs_struct init_fs;
#define INIT_GROUP_RWSEM(sig)
#endif
/*
 * Initializer fragment for the mems_allowed_seq member, used in the
 * init task initializer.  Expands to nothing when CONFIG_CPUSETS is not set.
 */
#ifdef CONFIG_CPUSETS
#define INIT_CPUSET_SEQ \
.mems_allowed_seq = SEQCNT_ZERO,
#else
#define INIT_CPUSET_SEQ
#endif
#define INIT_SIGNALS(sig) { \
.nr_threads = 1, \
.wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\
......@@ -192,6 +199,7 @@ extern struct cred init_cred;
INIT_FTRACE_GRAPH \
INIT_TRACE_RECURSION \
INIT_TASK_RCU_PREEMPT(tsk) \
INIT_CPUSET_SEQ \
}
......
......@@ -30,6 +30,7 @@
#define KPF_NOPAGE 20
#define KPF_KSM 21
#define KPF_THP 22
/* kernel hacking assistances
* WARNING: subject to change, never rely on them!
......
......@@ -77,7 +77,8 @@ extern void mem_cgroup_uncharge_end(void);
extern void mem_cgroup_uncharge_page(struct page *page);
extern void mem_cgroup_uncharge_cache_page(struct page *page);
extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask);
extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order);
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg);
extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
......@@ -140,6 +141,34 @@ static inline bool mem_cgroup_disabled(void)
return false;
}
void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked,
unsigned long *flags);
extern atomic_t memcg_moving;
/*
 * Open a page-stat update section for 'page'.
 *
 * Does nothing when the memory cgroup controller is disabled.  Otherwise
 * takes rcu_read_lock(), clears *locked, and calls the out-of-line
 * __mem_cgroup_begin_update_page_stat() only when memcg_moving is non-zero.
 * Pair with mem_cgroup_end_update_page_stat() using the same 'locked' and
 * 'flags'.
 */
static inline void mem_cgroup_begin_update_page_stat(struct page *page,
		bool *locked, unsigned long *flags)
{
	if (mem_cgroup_disabled())
		return;

	rcu_read_lock();
	*locked = false;

	/* Common case: memcg_moving is zero, so the slow path is skipped. */
	if (!atomic_read(&memcg_moving))
		return;

	__mem_cgroup_begin_update_page_stat(page, locked, flags);
}
void __mem_cgroup_end_update_page_stat(struct page *page,
unsigned long *flags);
/*
 * Close a section opened by mem_cgroup_begin_update_page_stat().
 *
 * Does nothing when the memory cgroup controller is disabled.  Otherwise
 * calls __mem_cgroup_end_update_page_stat() if *locked is set, then drops
 * the RCU read lock.
 */
static inline void mem_cgroup_end_update_page_stat(struct page *page,
		bool *locked, unsigned long *flags)
{
	if (!mem_cgroup_disabled()) {
		if (*locked)
			__mem_cgroup_end_update_page_stat(page, flags);
		rcu_read_unlock();
	}
}
void mem_cgroup_update_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx,
int val);
......@@ -298,21 +327,6 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
{
}
static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg)
{
return 0;
}
static inline void mem_cgroup_note_reclaim_priority(struct mem_cgroup *memcg,
int priority)
{
}
static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *memcg,
int priority)
{
}
static inline bool mem_cgroup_disabled(void)
{
return true;
......@@ -355,6 +369,16 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
}
/* Stub variant: no locking and no state change; *locked and *flags are not written. */
static inline void mem_cgroup_begin_update_page_stat(struct page *page,
		bool *locked, unsigned long *flags)
{
}
/* Stub variant: nothing to undo; *locked and *flags are not read. */
static inline void mem_cgroup_end_update_page_stat(struct page *page,
		bool *locked, unsigned long *flags)
{
}
static inline void mem_cgroup_inc_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx)
{
......@@ -391,7 +415,7 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
struct page *newpage)
{
}
#endif /* CONFIG_CGROUP_MEM_CONT */
#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
#if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
static inline bool
......
......@@ -8,7 +8,6 @@
typedef struct page *new_page_t(struct page *, unsigned long private, int **);
#ifdef CONFIG_MIGRATION
#define PAGE_MIGRATION 1
extern void putback_lru_pages(struct list_head *l);
extern int migrate_page(struct address_space *,
......@@ -32,7 +31,6 @@ extern void migrate_page_copy(struct page *newpage, struct page *page);
extern int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page);
#else
#define PAGE_MIGRATION 0
static inline void putback_lru_pages(struct list_head *l) {}
static inline int migrate_pages(struct list_head *l, new_page_t x,
......
......@@ -1040,6 +1040,9 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma,
!vma_growsup(vma->vm_next, addr);
}
extern pid_t
vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group);
extern unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len);
......@@ -1058,19 +1061,20 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
/*
* per-process(per-mm_struct) statistics.
*/
static inline void set_mm_counter(struct mm_struct *mm, int member, long value)
{
atomic_long_set(&mm->rss_stat.count[member], value);
}
#if defined(SPLIT_RSS_COUNTING)
unsigned long get_mm_counter(struct mm_struct *mm, int member);
#else
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
return atomic_long_read(&mm->rss_stat.count[member]);
}
long val = atomic_long_read(&mm->rss_stat.count[member]);
#ifdef SPLIT_RSS_COUNTING
/*
* The counter is updated asynchronously and may transiently go negative,
* but a negative value is never what users expect to see.
*/
if (val < 0)
val = 0;
#endif
return (unsigned long)val;
}
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
......@@ -1127,9 +1131,9 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
}
#if defined(SPLIT_RSS_COUNTING)
void sync_mm_rss(struct task_struct *task, struct mm_struct *mm);
void sync_mm_rss(struct mm_struct *mm);
#else
static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
static inline void sync_mm_rss(struct mm_struct *mm)
{
}
#endif
......@@ -1291,8 +1295,6 @@ extern void get_pfn_range_for_nid(unsigned int nid,
extern unsigned long find_min_pfn_with_active_regions(void);
extern void free_bootmem_with_active_regions(int nid,
unsigned long max_low_pfn);
int add_from_early_node_map(struct range *range, int az,
int nr_range, int nid);
extern void sparse_memory_present_with_active_regions(int nid);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
......
......@@ -365,6 +365,7 @@ struct zone {
*/
unsigned int compact_considered;
unsigned int compact_defer_shift;
int compact_order_failed;
#endif
ZONE_PADDING(_pad1_)
......
......@@ -49,7 +49,7 @@ extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *mask);
int order, nodemask_t *mask, bool force_kill);
extern int register_oom_notifier(struct notifier_block *nb);
extern int unregister_oom_notifier(struct notifier_block *nb);
......
......@@ -414,11 +414,26 @@ static inline int PageTransHuge(struct page *page)
return PageHead(page);
}
/*
 * PageTransCompound returns true for both transparent huge pages
 * and hugetlbfs pages, so it should only be called when it's known
 * that hugetlbfs pages aren't involved.
 *
 * The return value is whatever PageCompound() reports for @page.
 */
static inline int PageTransCompound(struct page *page)
{
return PageCompound(page);
}
/*
 * PageTransTail returns true for both transparent huge pages
 * and hugetlbfs pages, so it should only be called when it's known
 * that hugetlbfs pages aren't involved.
 *
 * The return value is whatever PageTail() reports for @page.
 */
static inline int PageTransTail(struct page *page)
{
return PageTail(page);
}
#else
static inline int PageTransHuge(struct page *page)
......@@ -430,6 +445,11 @@ static inline int PageTransCompound(struct page *page)
{
return 0;
}
/*
 * Stub used on the #else side of the preprocessor conditional: always
 * returns 0 regardless of @page.
 * NOTE(review): presumably the !CONFIG_TRANSPARENT_HUGEPAGE build; the
 * matching #if is not visible in this hunk -- confirm.
 */
static inline int PageTransTail(struct page *page)
{
return 0;
}
#endif
#ifdef CONFIG_MMU
......
......@@ -4,12 +4,8 @@
enum {
/* flags for mem_cgroup */
PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */
PCG_CACHE, /* charged as cache */
PCG_USED, /* this object is in use. */
PCG_MIGRATION, /* under page migration */
/* flags for mem_cgroup and file and I/O status */
PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */
PCG_FILE_MAPPED, /* page is accounted as "mapped" */
__NR_PCG_FLAGS,
};
......@@ -64,19 +60,10 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
{ return test_and_clear_bit(PCG_##lname, &pc->flags); }
/* Cache flag is set only once (at allocation) */
TESTPCGFLAG(Cache, CACHE)
CLEARPCGFLAG(Cache, CACHE)
SETPCGFLAG(Cache, CACHE)
TESTPCGFLAG(Used, USED)
CLEARPCGFLAG(Used, USED)
SETPCGFLAG(Used, USED)
SETPCGFLAG(FileMapped, FILE_MAPPED)
CLEARPCGFLAG(FileMapped, FILE_MAPPED)
TESTPCGFLAG(FileMapped, FILE_MAPPED)
SETPCGFLAG(Migration, MIGRATION)
CLEARPCGFLAG(Migration, MIGRATION)
TESTPCGFLAG(Migration, MIGRATION)
......@@ -85,7 +72,7 @@ static inline void lock_page_cgroup(struct page_cgroup *pc)
{
/*
* Don't take this lock in IRQ context.
* This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION
* This lock is for pc->mem_cgroup, USED, MIGRATION
*/
bit_spin_lock(PCG_LOCK, &pc->flags);
}
......@@ -95,24 +82,6 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc)
bit_spin_unlock(PCG_LOCK, &pc->flags);
}
/*
 * Take the PCG_MOVE_LOCK bit spinlock in pc->flags with local interrupts
 * disabled.
 * @pc:    page_cgroup whose flags word carries the lock bit
 * @flags: where the saved interrupt state is stored; hand the same pointer
 *         to move_unlock_page_cgroup() to restore it
 */
static inline void move_lock_page_cgroup(struct page_cgroup *pc,
unsigned long *flags)
{
/*
 * Updates to the page cache statistics bits in pc->flags can come from
 * both normal context and IRQ context. Disable IRQs to avoid deadlock.
 */
local_irq_save(*flags);
bit_spin_lock(PCG_MOVE_LOCK, &pc->flags);
}
/*
 * Release the PCG_MOVE_LOCK bit spinlock in pc->flags, then restore the
 * interrupt state saved in *@flags. The order is the reverse of
 * move_lock_page_cgroup().
 */
static inline void move_unlock_page_cgroup(struct page_cgroup *pc,
unsigned long *flags)
{
bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags);
local_irq_restore(*flags);
}
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
struct page_cgroup;
......
......@@ -122,7 +122,6 @@ void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
void anon_vma_moveto_tail(struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
void __anon_vma_link(struct vm_area_struct *);
static inline void anon_vma_merge(struct vm_area_struct *vma,
struct vm_area_struct *next)
......
......@@ -1514,7 +1514,7 @@ struct task_struct {
#endif
#ifdef CONFIG_CPUSETS
nodemask_t mems_allowed; /* Protected by alloc_lock */
int mems_allowed_change_disable;
seqcount_t mems_allowed_seq; /* Sequence number to catch updates */
int cpuset_mem_spread_rotor;
int cpuset_slab_spread_rotor;
#endif
......
......@@ -223,6 +223,7 @@ extern void lru_add_page_tail(struct zone* zone,
extern void activate_page(struct page *);
extern void mark_page_accessed(struct page *);
extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
extern int lru_add_drain_all(void);
extern void rotate_reclaimable_page(struct page *page);
extern void deactivate_page(struct page *page);
......@@ -329,7 +330,6 @@ extern long total_swap_pages;
extern void si_swapinfo(struct sysinfo *);
extern swp_entry_t get_swap_page(void);
extern swp_entry_t get_swap_page_of_type(int);
extern int valid_swaphandles(swp_entry_t, unsigned long *);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t);
extern int swap_duplicate(swp_entry_t);
......
......@@ -482,7 +482,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
/* hugetlb_file_setup applies strict accounting */
if (shmflg & SHM_NORESERVE)
acctflag = VM_NORESERVE;
file = hugetlb_file_setup(name, size, acctflag,
file = hugetlb_file_setup(name, 0, size, acctflag,
&shp->mlock_user, HUGETLB_SHMFS_INODE);
} else {
/*
......
......@@ -4881,9 +4881,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
rcu_assign_pointer(id->css, NULL);
rcu_assign_pointer(css->id, NULL);
write_lock(&ss->id_lock);
spin_lock(&ss->id_lock);
idr_remove(&ss->idr, id->id);
write_unlock(&ss->id_lock);
spin_unlock(&ss->id_lock);
kfree_rcu(id, rcu_head);
}
EXPORT_SYMBOL_GPL(free_css_id);
......@@ -4909,10 +4909,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
error = -ENOMEM;
goto err_out;
}
write_lock(&ss->id_lock);
spin_lock(&ss->id_lock);
/* Don't use 0. allocates an ID of 1-65535 */
error = idr_get_new_above(&ss->idr, newid, 1, &myid);
write_unlock(&ss->id_lock);
spin_unlock(&ss->id_lock);
/* Returns error when there are no free spaces for new ID.*/
if (error) {
......@@ -4927,9 +4927,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
return newid;
remove_idr:
error = -ENOSPC;
write_lock(&ss->id_lock);
spin_lock(&ss->id_lock);
idr_remove(&ss->idr, myid);
write_unlock(&ss->id_lock);
spin_unlock(&ss->id_lock);
err_out:
kfree(newid);
return ERR_PTR(error);
......@@ -4941,7 +4941,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
{
struct css_id *newid;
rwlock_init(&ss->id_lock);
spin_lock_init(&ss->id_lock);
idr_init(&ss->idr);
newid = get_new_cssid(ss, 0);
......@@ -5029,6 +5029,8 @@ css_get_next(struct cgroup_subsys *ss, int id,
return NULL;
BUG_ON(!ss->use_id);
WARN_ON_ONCE(!rcu_read_lock_held());
/* fill start point for scan */
tmpid = id;
while (1) {
......@@ -5036,10 +5038,7 @@ css_get_next(struct cgroup_subsys *ss, int id,
* scan next entry from bitmap(tree), tmpid is updated after
* idr_get_next().
*/
read_lock(&ss->id_lock);
tmp = idr_get_next(&ss->idr, &tmpid);
read_unlock(&ss->id_lock);
if (!tmp)
break;
if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
......
......@@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
{
bool need_loop;
repeat:
/*
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
......@@ -983,45 +982,19 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
*/
need_loop = task_has_mempolicy(tsk) ||
!nodes_intersects(*newmems, tsk->mems_allowed);
nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
/*
* ensure checking ->mems_allowed_change_disable after setting all new
* allowed nodes.
*
* the read-side task can see an nodemask with new allowed nodes and
* old allowed nodes. and if it allocates page when cpuset clears newly
* disallowed ones continuous, it can see the new allowed bits.
*
* And if setting all new allowed nodes is after the checking, setting
* all new allowed nodes and clearing newly disallowed ones will be done
* continuous, and the read-side task may find no node to alloc page.
*/
smp_mb();
if (need_loop)
write_seqcount_begin(&tsk->mems_allowed_seq);
/*
* Allocation of memory is very fast, we needn't sleep when waiting
* for the read-side.
*/
while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
task_unlock(tsk);
if (!task_curr(tsk))
yield();
goto repeat;
}
/*
* ensure checking ->mems_allowed_change_disable before clearing all new
* disallowed nodes.
*
* if clearing newly disallowed bits before the checking, the read-side
* task may find no node to alloc page.
*/
smp_mb();
nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
tsk->mems_allowed = *newmems;
if (need_loop)
write_seqcount_end(&tsk->mems_allowed_seq);
task_unlock(tsk);
}
......
......@@ -935,7 +935,7 @@ void do_exit(long code)
acct_update_integrals(tsk);
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
sync_mm_rss(tsk, tsk->mm);
sync_mm_rss(tsk->mm);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
hrtimer_cancel(&tsk->signal->real_timer);
......
......@@ -512,6 +512,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
return NULL;
}
/*
 * Sanity-check an mm_struct: every per-mm rss counter is expected to read
 * zero here, and each non-zero counter is reported with a KERN_ALERT
 * message. With CONFIG_TRANSPARENT_HUGEPAGE, also assert that no
 * pmd_huge_pte is left attached to the mm.
 */
static void check_mm(struct mm_struct *mm)
{
	int idx = 0;

	while (idx < NR_MM_COUNTERS) {
		long count = atomic_long_read(&mm->rss_stat.count[idx]);

		/* A leftover count means rss accounting went out of balance. */
		if (unlikely(count != 0))
			printk(KERN_ALERT "BUG: Bad rss-counter state "
					  "mm:%p idx:%d val:%ld\n", mm, idx, count);
		idx++;
	}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	VM_BUG_ON(mm->pmd_huge_pte);
#endif
}
/*
* Allocate and initialize an mm_struct.
*/
......@@ -539,9 +556,7 @@ void __mmdrop(struct mm_struct *mm)
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
VM_BUG_ON(mm->pmd_huge_pte);
#endif
check_mm(mm);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
......@@ -1223,6 +1238,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
......
......@@ -595,8 +595,10 @@ EXPORT_SYMBOL(idr_for_each);
* Returns pointer to registered object with id, which is next number to
* given id. After being looked up, *@nextidp will be updated for the next
* iteration.
*
* This function can be called under rcu_read_lock(), given that the leaf
* pointers lifetimes are correctly managed.
*/
void *idr_get_next(struct idr *idp, int *nextidp)
{
struct idr_layer *p, *pa[MAX_LEVEL];
......@@ -605,11 +607,11 @@ void *idr_get_next(struct idr *idp, int *nextidp)
int n, max;
/* find first ent */
n = idp->layers * IDR_BITS;
max = 1 << n;
p = rcu_dereference_raw(idp->top);
if (!p)
return NULL;
n = (p->layer + 1) * IDR_BITS;
max = 1 << n;
while (id < max) {
while (n > 0 && p) {
......
......@@ -766,14 +766,13 @@ void * __init alloc_bootmem_section(unsigned long size,
unsigned long section_nr)
{
bootmem_data_t *bdata;
unsigned long pfn, goal, limit;
unsigned long pfn, goal;
pfn = section_nr_to_pfn(section_nr);
goal = pfn << PAGE_SHIFT;
limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
}
#endif
......
......@@ -35,7 +35,7 @@ struct compact_control {
unsigned long migrate_pfn; /* isolate_migratepages search base */
bool sync; /* Synchronous migration */
unsigned int order; /* order a direct compactor needs */
int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
};
......@@ -675,49 +675,71 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
/* Compact all zones within a node */
static int compact_node(int nid)
static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
{
int zoneid;
pg_data_t *pgdat;
struct zone *zone;
if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
return -EINVAL;
pgdat = NODE_DATA(nid);
/* Flush pending updates to the LRU lists */
lru_add_drain_all();
for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
struct compact_control cc = {
.nr_freepages = 0,
.nr_migratepages = 0,
.order = -1,
.sync = true,
};
zone = &pgdat->node_zones[zoneid];
if (!populated_zone(zone))
continue;
cc.zone = zone;
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
compact_zone(zone, &cc);
cc->nr_freepages = 0;
cc->nr_migratepages = 0;
cc->zone = zone;
INIT_LIST_HEAD(&cc->freepages);
INIT_LIST_HEAD(&cc->migratepages);
if (cc->order == -1 || !compaction_deferred(zone, cc->order))
compact_zone(zone, cc);
if (cc->order > 0) {
int ok = zone_watermark_ok(zone, cc->order,
low_wmark_pages(zone), 0, 0);
if (ok && cc->order > zone->compact_order_failed)
zone->compact_order_failed = cc->order + 1;
/* Currently async compaction is never deferred. */
else if (!ok && cc->sync)
defer_compaction(zone, cc->order);
}
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
VM_BUG_ON(!list_empty(&cc->freepages));
VM_BUG_ON(!list_empty(&cc->migratepages));
}
return 0;
}
/*
 * Compact the zones of @pgdat on behalf of an allocation of @order.
 * The compact_control is set up with .sync = false (asynchronous
 * migration); all other fields start out zeroed by the designated
 * initializer. Returns whatever __compact_pgdat() returns.
 */
int compact_pgdat(pg_data_t *pgdat, int order)
{
struct compact_control cc = {
.order = order,
.sync = false,
};
return __compact_pgdat(pgdat, &cc);
}
/*
 * Compact every zone of node @nid with synchronous migration.
 * .order = -1 is passed so that __compact_pgdat() does not consult
 * compaction_deferred() before compacting a zone.
 * @nid is not validated here; NODE_DATA(nid) is used directly, so the
 * caller must pass a valid, online node id.
 */
static int compact_node(int nid)
{
struct compact_control cc = {
.order = -1,
.sync = true,
};
return __compact_pgdat(NODE_DATA(nid), &cc);
}
/* Compact all nodes in the system */
static int compact_nodes(void)
{
int nid;
/* Flush pending updates to the LRU lists */
lru_add_drain_all();
for_each_online_node(nid)
compact_node(nid);
......@@ -750,7 +772,14 @@ ssize_t sysfs_compact_node(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
compact_node(dev->id);
int nid = dev->id;
if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
/* Flush pending updates to the LRU lists */
lru_add_drain_all();
compact_node(nid);
}
return count;
}
......
......@@ -101,9 +101,8 @@
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
* (code doesn't rely on that order, so you could switch it around)
* ->tasklist_lock (memory_failure, collect_procs_ao)
* ->i_mmap_mutex
* ->i_mmap_mutex
* ->tasklist_lock (memory_failure, collect_procs_ao)
*/
/*
......@@ -500,10 +499,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
struct page *page;
if (cpuset_do_page_mem_spread()) {
get_mems_allowed();
n = cpuset_mem_spread_node();
page = alloc_pages_exact_node(n, gfp, 0);
put_mems_allowed();
unsigned int cpuset_mems_cookie;
do {
cpuset_mems_cookie = get_mems_allowed();
n = cpuset_mem_spread_node();
page = alloc_pages_exact_node(n, gfp, 0);
} while (!put_mems_allowed(cpuset_mems_cookie) && !page);
return page;
}
return alloc_pages(gfp, 0);
......@@ -2341,7 +2343,9 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
struct page *page;
gfp_t gfp_notmask = 0;
gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE;
gfp_mask = mapping_gfp_mask(mapping);
if (mapping_cap_account_dirty(mapping))
gfp_mask |= __GFP_WRITE;
if (flags & AOP_FLAG_NOFS)
gfp_notmask = __GFP_FS;
repeat:
......
......@@ -1031,32 +1031,23 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
{
int ret = 0;
spin_lock(&tlb->mm->page_table_lock);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
spin_unlock(&tlb->mm->page_table_lock);
wait_split_huge_page(vma->anon_vma,
pmd);
} else {
struct page *page;
pgtable_t pgtable;
pgtable = get_pmd_huge_pte(tlb->mm);
page = pmd_page(*pmd);
pmd_clear(pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
page_remove_rmap(page);
VM_BUG_ON(page_mapcount(page) < 0);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON(!PageHead(page));
tlb->mm->nr_ptes--;
spin_unlock(&tlb->mm->page_table_lock);
tlb_remove_page(tlb, page);
pte_free(tlb->mm, pgtable);
ret = 1;
}
} else
if (__pmd_trans_huge_lock(pmd, vma) == 1) {
struct page *page;
pgtable_t pgtable;
pgtable = get_pmd_huge_pte(tlb->mm);
page = pmd_page(*pmd);
pmd_clear(pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
page_remove_rmap(page);
VM_BUG_ON(page_mapcount(page) < 0);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON(!PageHead(page));
tlb->mm->nr_ptes--;
spin_unlock(&tlb->mm->page_table_lock);
tlb_remove_page(tlb, page);
pte_free(tlb->mm, pgtable);
ret = 1;
}
return ret;
}
......@@ -1066,21 +1057,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
{
int ret = 0;
spin_lock(&vma->vm_mm->page_table_lock);
if (likely(pmd_trans_huge(*pmd))) {
ret = !pmd_trans_splitting(*pmd);
spin_unlock(&vma->vm_mm->page_table_lock);
if (unlikely(!ret))
wait_split_huge_page(vma->anon_vma, pmd);
else {
/*
* All logical pages in the range are present
* if backed by a huge page.
*/
memset(vec, 1, (end - addr) >> PAGE_SHIFT);
}
} else
if (__pmd_trans_huge_lock(pmd, vma) == 1) {
/*
* All logical pages in the range are present
* if backed by a huge page.
*/
spin_unlock(&vma->vm_mm->page_table_lock);
memset(vec, 1, (end - addr) >> PAGE_SHIFT);
ret = 1;
}
return ret;
}
......@@ -1110,20 +1095,11 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
goto out;
}
spin_lock(&mm->page_table_lock);
if (likely(pmd_trans_huge(*old_pmd))) {
if (pmd_trans_splitting(*old_pmd)) {
spin_unlock(&mm->page_table_lock);
wait_split_huge_page(vma->anon_vma, old_pmd);
ret = -1;
} else {
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
set_pmd_at(mm, new_addr, new_pmd, pmd);
spin_unlock(&mm->page_table_lock);
ret = 1;
}
} else {
ret = __pmd_trans_huge_lock(old_pmd, vma);
if (ret == 1) {
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
set_pmd_at(mm, new_addr, new_pmd, pmd);
spin_unlock(&mm->page_table_lock);
}
out:
......@@ -1136,24 +1112,41 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
struct mm_struct *mm = vma->vm_mm;
int ret = 0;
spin_lock(&mm->page_table_lock);
if (__pmd_trans_huge_lock(pmd, vma) == 1) {
pmd_t entry;
entry = pmdp_get_and_clear(mm, addr, pmd);
entry = pmd_modify(entry, newprot);
set_pmd_at(mm, addr, pmd, entry);
spin_unlock(&vma->vm_mm->page_table_lock);
ret = 1;
}
return ret;
}
/*
* Returns 1 if a given pmd maps a stable (not under splitting) thp.
* Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
*
* Note that if it returns 1, this routine returns without unlocking page
* table locks. So callers must unlock them.
*/
int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
spin_lock(&vma->vm_mm->page_table_lock);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
spin_unlock(&mm->page_table_lock);
spin_unlock(&vma->vm_mm->page_table_lock);
wait_split_huge_page(vma->anon_vma, pmd);
return -1;
} else {
pmd_t entry;
entry = pmdp_get_and_clear(mm, addr, pmd);
entry = pmd_modify(entry, newprot);
set_pmd_at(mm, addr, pmd, entry);
spin_unlock(&vma->vm_mm->page_table_lock);
ret = 1;
/* Thp mapped by 'pmd' is stable, so we can
* handle it as it is. */
return 1;
}
} else
spin_unlock(&vma->vm_mm->page_table_lock);
return ret;
}
spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
pmd_t *page_check_address_pmd(struct page *page,
......
......@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
*/
static DEFINE_SPINLOCK(hugetlb_lock);
/*
 * Drop spool->lock and, if the subpool has become unused, free it.
 * The caller must hold spool->lock on entry. The "unused" decision is
 * sampled before the lock is dropped; @spool must not be touched by the
 * caller after this returns.
 */
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
	bool unused = !spool->count && !spool->used_hpages;

	spin_unlock(&spool->lock);

	/*
	 * Only free the subpool when no pages are in use and no other
	 * handles to it remain.
	 */
	if (!unused)
		return;
	kfree(spool);
}
/*
 * Allocate a subpool limited to @nr_blocks huge pages.
 * The new subpool starts with one handle (count == 1) and no pages in use.
 * Returns the subpool, or NULL if the allocation fails.
 */
struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
{
	struct hugepage_subpool *spool = kmalloc(sizeof(*spool), GFP_KERNEL);

	if (spool) {
		spin_lock_init(&spool->lock);
		spool->count = 1;
		spool->max_hpages = nr_blocks;
		spool->used_hpages = 0;
	}

	return spool;
}
/*
 * Drop one handle on @spool. It is a BUG to call this when the handle
 * count is already zero. The subpool is freed once the count reaches
 * zero and no pages remain in use.
 */
void hugepage_put_subpool(struct hugepage_subpool *spool)
{
	spin_lock(&spool->lock);
	BUG_ON(spool->count == 0);
	spool->count -= 1;
	/* Releases spool->lock and frees the subpool if it is now unused. */
	unlock_or_release_subpool(spool);
}
/*
 * Charge @delta huge pages against @spool.
 * A NULL @spool means there is nothing to account and always succeeds.
 * Returns 0 on success, or -ENOMEM if the charge would push used_hpages
 * above max_hpages (in which case nothing is charged).
 */
static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
				      long delta)
{
	int err = 0;

	if (!spool)
		return 0;

	spin_lock(&spool->lock);
	if ((spool->used_hpages + delta) > spool->max_hpages)
		err = -ENOMEM;
	else
		spool->used_hpages += delta;
	spin_unlock(&spool->lock);

	return err;
}
/*
 * Return @delta huge pages to @spool. A NULL @spool is a no-op.
 * May free the subpool, so @spool must not be used after this call.
 */
static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
				       long delta)
{
	if (spool) {
		spin_lock(&spool->lock);
		spool->used_hpages -= delta;
		/*
		 * If hugetlbfs_put_super couldn't free spool due to
		 * an outstanding quota reference, free it now.
		 */
		unlock_or_release_subpool(spool);
	}
}
/*
 * Return the subpool recorded in the hugetlbfs superblock info of
 * @inode's superblock (HUGETLBFS_SB(inode->i_sb)->spool).
 */
static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
return HUGETLBFS_SB(inode->i_sb)->spool;
}
/*
 * Return the subpool for the file backing @vma, looked up through the
 * file's dentry inode. @vma->vm_file is dereferenced unconditionally.
 */
static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
return subpool_inode(vma->vm_file->f_dentry->d_inode);
}
/*
* Region tracking -- allows tracking of reservations and instantiated pages
* across the pages in a mapping.
......@@ -454,14 +532,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve)
{
struct page *page = NULL;
struct page *page;
struct mempolicy *mpol;
nodemask_t *nodemask;
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
unsigned int cpuset_mems_cookie;
get_mems_allowed();
retry_cpuset:
cpuset_mems_cookie = get_mems_allowed();
zonelist = huge_zonelist(vma, address,
htlb_alloc_mask, &mpol, &nodemask);
/*
......@@ -488,10 +568,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
}
}
}
err:
mpol_cond_put(mpol);
put_mems_allowed();
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
err:
mpol_cond_put(mpol);
return NULL;
}
static void update_and_free_page(struct hstate *h, struct page *page)
......@@ -533,9 +618,9 @@ static void free_huge_page(struct page *page)
*/
struct hstate *h = page_hstate(page);
int nid = page_to_nid(page);
struct address_space *mapping;
struct hugepage_subpool *spool =
(struct hugepage_subpool *)page_private(page);
mapping = (struct address_space *) page_private(page);
set_page_private(page, 0);
page->mapping = NULL;
BUG_ON(page_count(page));
......@@ -551,8 +636,7 @@ static void free_huge_page(struct page *page)
enqueue_huge_page(h, page);
}
spin_unlock(&hugetlb_lock);
if (mapping)
hugetlb_put_quota(mapping, 1);
hugepage_subpool_put_pages(spool, 1);
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
......@@ -852,6 +936,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
struct page *page, *tmp;
int ret, i;
int needed, allocated;
bool alloc_ok = true;
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
if (needed <= 0) {
......@@ -867,17 +952,13 @@ static int gather_surplus_pages(struct hstate *h, int delta)
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page)
/*
* We were not able to allocate enough pages to
* satisfy the entire reservation so we free what
* we've allocated so far.
*/
goto free;
if (!page) {
alloc_ok = false;
break;
}
list_add(&page->lru, &surplus_list);
}
allocated += needed;
allocated += i;
/*
* After retaking hugetlb_lock, we need to recalculate 'needed'
......@@ -886,9 +967,16 @@ static int gather_surplus_pages(struct hstate *h, int delta)
spin_lock(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) -
(h->free_huge_pages + allocated);
if (needed > 0)
goto retry;
if (needed > 0) {
if (alloc_ok)
goto retry;
/*
* We were not able to allocate enough pages to
* satisfy the entire reservation so we free what
* we've allocated so far.
*/
goto free;
}
/*
* The surplus_list now contains _at_least_ the number of extra pages
* needed to accommodate the reservation. Add the appropriate number
......@@ -914,10 +1002,10 @@ static int gather_surplus_pages(struct hstate *h, int delta)
VM_BUG_ON(page_count(page));
enqueue_huge_page(h, page);
}
free:
spin_unlock(&hugetlb_lock);
/* Free unnecessary surplus pages to the buddy allocator */
free:
if (!list_empty(&surplus_list)) {
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
list_del(&page->lru);
......@@ -966,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h,
/*
* Determine if the huge page at addr within the vma has an associated
* reservation. Where it does not we will need to logically increase
* reservation and actually increase quota before an allocation can occur.
* Where any new reservation would be required the reservation change is
* prepared, but not committed. Once the page has been quota'd allocated
* an instantiated the change should be committed via vma_commit_reservation.
* No action is required on failure.
* reservation and actually increase subpool usage before an allocation
* can occur. Where any new reservation would be required the
* reservation change is prepared, but not committed. Once the page
* has been allocated from the subpool and instantiated the change should
* be committed via vma_commit_reservation. No action is required on
* failure.
*/
static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
......@@ -1019,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h,
static struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
long chg;
/*
* Processes that did not create the mapping will have no reserves and
* will not have accounted against quota. Check that the quota can be
* made before satisfying the allocation
* MAP_NORESERVE mappings may also need pages and quota allocated
* if no reserve mapping overlaps.
* Processes that did not create the mapping will have no
* reserves and will not have been accounted against the subpool
* limit. Check that the subpool limit can be met before
* satisfying the allocation. MAP_NORESERVE mappings may also
* need pages and subpool usage charged if no reserve
* mapping overlaps.
*/
chg = vma_needs_reservation(h, vma, addr);
if (chg < 0)
return ERR_PTR(-VM_FAULT_OOM);
if (chg)
if (hugetlb_get_quota(inode->i_mapping, chg))
if (hugepage_subpool_get_pages(spool, chg))
return ERR_PTR(-VM_FAULT_SIGBUS);
spin_lock(&hugetlb_lock);
......@@ -1046,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
if (!page) {
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) {
hugetlb_put_quota(inode->i_mapping, chg);
hugepage_subpool_put_pages(spool, chg);
return ERR_PTR(-VM_FAULT_SIGBUS);
}
}
set_page_private(page, (unsigned long) mapping);
set_page_private(page, (unsigned long)spool);
vma_commit_reservation(h, vma, addr);
......@@ -2072,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
struct resv_map *reservations = vma_resv_map(vma);
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve;
unsigned long start;
unsigned long end;
......@@ -2087,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
if (reserve) {
hugetlb_acct_memory(h, -reserve);
hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
hugepage_subpool_put_pages(spool, reserve);
}
}
}
......@@ -2276,6 +2366,10 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
if (pte_dirty(pte))
set_page_dirty(page);
list_add(&page->lru, &page_list);
/* Bail out after unmapping reference page if supplied */
if (ref_page)
break;
}
flush_tlb_range(vma, start, end);
spin_unlock(&mm->page_table_lock);
......@@ -2316,7 +2410,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
*/
address = address & huge_page_mask(h);
pgoff = vma_hugecache_offset(h, vma, address);
mapping = (struct address_space *)page_private(page);
mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
/*
* Take the mapping lock for the duration of the table walk. As
......@@ -2869,11 +2963,12 @@ int hugetlb_reserve_pages(struct inode *inode,
{
long ret, chg;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
/*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
* and filesystem quota without using reserves
* without using reserves
*/
if (vm_flags & VM_NORESERVE)
return 0;
......@@ -2900,17 +2995,17 @@ int hugetlb_reserve_pages(struct inode *inode,
if (chg < 0)
return chg;
/* There must be enough filesystem quota for the mapping */
if (hugetlb_get_quota(inode->i_mapping, chg))
/* There must be enough pages in the subpool for the mapping */
if (hugepage_subpool_get_pages(spool, chg))
return -ENOSPC;
/*
* Check enough hugepages are available for the reservation.
* Hand back the quota if there are not
* Hand the pages back to the subpool if there are not
*/
ret = hugetlb_acct_memory(h, chg);
if (ret < 0) {
hugetlb_put_quota(inode->i_mapping, chg);
hugepage_subpool_put_pages(spool, chg);
return ret;
}
......@@ -2934,12 +3029,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
struct hstate *h = hstate_inode(inode);
long chg = region_truncate(&inode->i_mapping->private_list, offset);
struct hugepage_subpool *spool = subpool_inode(inode);
spin_lock(&inode->i_lock);
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
hugetlb_put_quota(inode->i_mapping, (chg - freed));
hugepage_subpool_put_pages(spool, (chg - freed));
hugetlb_acct_memory(h, -(chg - freed));
}
......
......@@ -374,6 +374,20 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}
/*
 * Look up the vma that contains @addr in @mm and return it only if it is
 * eligible for KSM: the mm must not be exiting (ksm_test_exit()), the vma
 * must actually cover @addr, carry VM_MERGEABLE and have an anon_vma.
 * Returns NULL in every other case.
 */
static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
		unsigned long addr)
{
	struct vm_area_struct *vma = NULL;

	if (!ksm_test_exit(mm))
		vma = find_vma(mm, addr);

	/* find_vma() may return a vma that starts above addr. */
	if (!vma || addr < vma->vm_start)
		return NULL;

	if ((vma->vm_flags & VM_MERGEABLE) && vma->anon_vma)
		return vma;

	return NULL;
}
static void break_cow(struct rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
......@@ -387,15 +401,9 @@ static void break_cow(struct rmap_item *rmap_item)
put_anon_vma(rmap_item->anon_vma);
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
goto out;
vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr)
goto out;
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
goto out;
break_ksm(vma, addr);
out:
vma = find_mergeable_vma(mm, addr);
if (vma)
break_ksm(vma, addr);
up_read(&mm->mmap_sem);
}
......@@ -421,12 +429,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
struct page *page;
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
goto out;
vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr)
goto out;
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
vma = find_mergeable_vma(mm, addr);
if (!vma)
goto out;
page = follow_page(vma, addr, FOLL_GET);
......
此差异已折叠。
......@@ -1063,7 +1063,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
if (!PageHuge(p) && !PageTransCompound(p)) {
if (!PageHuge(p) && !PageTransTail(p)) {
if (!PageLRU(p))
shake_page(p, 0);
if (!PageLRU(p)) {
......
......@@ -125,17 +125,17 @@ core_initcall(init_zero_pfn);
#if defined(SPLIT_RSS_COUNTING)
static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
void sync_mm_rss(struct mm_struct *mm)
{
int i;
for (i = 0; i < NR_MM_COUNTERS; i++) {
if (task->rss_stat.count[i]) {
add_mm_counter(mm, i, task->rss_stat.count[i]);
task->rss_stat.count[i] = 0;
if (current->rss_stat.count[i]) {
add_mm_counter(mm, i, current->rss_stat.count[i]);
current->rss_stat.count[i] = 0;
}
}
task->rss_stat.events = 0;
current->rss_stat.events = 0;
}
static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
......@@ -157,30 +157,7 @@ static void check_sync_rss_stat(struct task_struct *task)
if (unlikely(task != current))
return;
if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
__sync_task_rss_stat(task, task->mm);
}
/*
 * Read the RSS counter selected by @member from @mm, clamped at zero.
 *
 * The value is taken from @mm directly instead of going through a task's
 * ->mm pointer; the caller is responsible for making sure the mm it
 * passes in is valid.
 */
unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
	long count = atomic_long_read(&mm->rss_stat.count[member]);

	/*
	 * The counter is updated asynchronously and can be observed as a
	 * negative number.  That is never a meaningful value for users, so
	 * report zero in that case.
	 */
	return count < 0 ? 0 : (unsigned long)count;
}
void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
{
__sync_task_rss_stat(task, mm);
sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */
......@@ -661,7 +638,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
int i;
if (current->mm == mm)
sync_mm_rss(current, mm);
sync_mm_rss(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
if (rss[i])
add_mm_counter(mm, i, rss[i]);
......@@ -1247,16 +1224,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
do {
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
if (next-addr != HPAGE_PMD_SIZE) {
if (next - addr != HPAGE_PMD_SIZE) {
VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
split_huge_page_pmd(vma->vm_mm, pmd);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
continue;
goto next;
/* fall through */
}
if (pmd_none_or_clear_bad(pmd))
continue;
/*
* Here there can be other concurrent MADV_DONTNEED or
* trans huge page faults running, and if the pmd is
* none or trans huge it can change under us. This is
* because MADV_DONTNEED holds the mmap_sem in read
* mode.
*/
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
goto next;
next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
cond_resched();
} while (pmd++, addr = next, addr != end);
......
......@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
do {
next = pmd_addr_end(addr, end);
split_huge_page_pmd(vma->vm_mm, pmd);
if (pmd_none_or_clear_bad(pmd))
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
if (check_pte_range(vma, pmd, addr, next, nodes,
flags, private))
......@@ -1323,12 +1323,9 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
err = -ESRCH;
goto out;
}
mm = get_task_mm(task);
rcu_read_unlock();
get_task_struct(task);
err = -EINVAL;
if (!mm)
goto out;
/*
* Check if this process has the right to modify the specified
......@@ -1336,14 +1333,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
* capabilities, superuser privileges or the same
* userid as the target process.
*/
rcu_read_lock();
tcred = __task_cred(task);
if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
cred->uid != tcred->suid && cred->uid != tcred->uid &&
!capable(CAP_SYS_NICE)) {
rcu_read_unlock();
err = -EPERM;
goto out;
goto out_put;
}
rcu_read_unlock();
......@@ -1351,26 +1347,36 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
/* Is the user allowed to access the target nodes? */
if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out;
goto out_put;
}
if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
err = -EINVAL;
goto out;
goto out_put;
}
err = security_task_movememory(task);
if (err)
goto out;
goto out_put;
err = do_migrate_pages(mm, old, new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
mm = get_task_mm(task);
put_task_struct(task);
if (mm)
mmput(mm);
err = do_migrate_pages(mm, old, new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
else
err = -EINVAL;
mmput(mm);
out:
NODEMASK_SCRATCH_FREE(scratch);
return err;
out_put:
put_task_struct(task);
goto out;
}
......@@ -1844,18 +1850,24 @@ struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
struct mempolicy *pol;
struct zonelist *zl;
struct page *page;
unsigned int cpuset_mems_cookie;
retry_cpuset:
pol = get_vma_policy(current, vma, addr);
cpuset_mems_cookie = get_mems_allowed();
get_mems_allowed();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
put_mems_allowed();
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
}
zl = policy_zonelist(gfp, pol, node);
......@@ -1866,7 +1878,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
struct page *page = __alloc_pages_nodemask(gfp, order,
zl, policy_nodemask(gfp, pol));
__mpol_put(pol);
put_mems_allowed();
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
}
/*
......@@ -1874,7 +1887,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
*/
page = __alloc_pages_nodemask(gfp, order, zl,
policy_nodemask(gfp, pol));
put_mems_allowed();
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
}
......@@ -1901,11 +1915,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
struct page *page;
unsigned int cpuset_mems_cookie;
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
get_mems_allowed();
retry_cpuset:
cpuset_mems_cookie = get_mems_allowed();
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
......@@ -1916,7 +1933,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
page = __alloc_pages_nodemask(gfp, order,
policy_zonelist(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
put_mems_allowed();
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
}
EXPORT_SYMBOL(alloc_pages_current);
......
......@@ -1174,20 +1174,17 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
* Migrate an array of page address onto an array of nodes and fill
* the corresponding array of status.
*/
static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
unsigned long nr_pages,
const void __user * __user *pages,
const int __user *nodes,
int __user *status, int flags)
{
struct page_to_node *pm;
nodemask_t task_nodes;
unsigned long chunk_nr_pages;
unsigned long chunk_start;
int err;
task_nodes = cpuset_mems_allowed(task);
err = -ENOMEM;
pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
if (!pm)
......@@ -1349,6 +1346,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
struct task_struct *task;
struct mm_struct *mm;
int err;
nodemask_t task_nodes;
/* Check flags */
if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
......@@ -1364,11 +1362,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
rcu_read_unlock();
return -ESRCH;
}
mm = get_task_mm(task);
rcu_read_unlock();
if (!mm)
return -EINVAL;
get_task_struct(task);
/*
* Check if this process has the right to modify the specified
......@@ -1376,7 +1370,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
* capabilities, superuser privileges or the same
* userid as the target process.
*/
rcu_read_lock();
tcred = __task_cred(task);
if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
cred->uid != tcred->suid && cred->uid != tcred->uid &&
......@@ -1391,16 +1384,25 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
if (err)
goto out;
if (nodes) {
err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
flags);
} else {
err = do_pages_stat(mm, nr_pages, pages, status);
}
task_nodes = cpuset_mems_allowed(task);
mm = get_task_mm(task);
put_task_struct(task);
if (mm) {
if (nodes)
err = do_pages_move(mm, task_nodes, nr_pages, pages,
nodes, status, flags);
else
err = do_pages_stat(mm, nr_pages, pages, status);
} else
err = -EINVAL;
out:
mmput(mm);
return err;
out:
put_task_struct(task);
return err;
}
/*
......
......@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
}
/* fall through */
}
if (pmd_none_or_clear_bad(pmd))
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
mincore_unmapped_range(vma, addr, next, vec);
else
mincore_pte_range(vma, pmd, addr, next, vec);
......
......@@ -451,9 +451,8 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
}
/*
* Helper for vma_adjust in the split_vma insert case:
* insert vm structure into list and rbtree and anon_vma,
* but it has already been inserted into prio_tree earlier.
* Helper for vma_adjust() in the split_vma insert case: insert a vma into the
* mm's list and rbtree. It has already been inserted into the prio_tree.
*/
static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
......@@ -1112,9 +1111,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
* A dummy user value is used because we are not locking
* memory so no accounting is necessary
*/
len = ALIGN(len, huge_page_size(&default_hstate));
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
&user, HUGETLB_ANONHUGE_INODE);
file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
VM_NORESERVE, &user,
HUGETLB_ANONHUGE_INODE);
if (IS_ERR(file))
return PTR_ERR(file);
}
......@@ -1439,10 +1438,8 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
/*
* Is this a new hole at the lowest possible address?
*/
if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) {
if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
mm->free_area_cache = addr;
mm->cached_hole_size = ~0UL;
}
}
/*
......@@ -1457,7 +1454,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
unsigned long addr = addr0, start_addr;
/* requested length too big for entire address space */
if (len > TASK_SIZE)
......@@ -1481,22 +1478,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
mm->free_area_cache = mm->mmap_base;
}
try_again:
/* either no address requested or can't fit in requested address hole */
addr = mm->free_area_cache;
/* make sure it can fit in the remaining address space */
if (addr > len) {
vma = find_vma(mm, addr-len);
if (!vma || addr <= vma->vm_start)
/* remember the address as a hint for next time */
return (mm->free_area_cache = addr-len);
}
if (mm->mmap_base < len)
goto bottomup;
start_addr = addr = mm->free_area_cache;
addr = mm->mmap_base-len;
if (addr < len)
goto fail;
addr -= len;
do {
/*
* Lookup failure means no vma is above this address,
......@@ -1516,7 +1505,21 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
addr = vma->vm_start-len;
} while (len < vma->vm_start);
bottomup:
fail:
/*
* If the hint left us with no space for the requested
* mapping, then try again:
*
* Note: this differs from the bottom-up case, which does a
* full linear search; here we use find_vma(), which can
* cause some holes to be skipped.
*/
if (start_addr != mm->mmap_base) {
mm->free_area_cache = mm->mmap_base;
mm->cached_hole_size = 0;
goto try_again;
}
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
......
......@@ -53,7 +53,7 @@ void unuse_mm(struct mm_struct *mm)
struct task_struct *tsk = current;
task_lock(tsk);
sync_mm_rss(tsk, mm);
sync_mm_rss(mm);
tsk->mm = NULL;
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
......
......@@ -60,7 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
ptent = pte_mkwrite(ptent);
ptep_modify_prot_commit(mm, addr, pte, ptent);
} else if (PAGE_MIGRATION && !pte_file(oldpte)) {
} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
if (is_write_migration_entry(entry)) {
......
......@@ -34,6 +34,7 @@
#include <linux/ptrace.h>
#include <linux/freezer.h>
#include <linux/ftrace.h>
#include <linux/ratelimit.h>
#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>
......@@ -309,7 +310,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
*/
static struct task_struct *select_bad_process(unsigned int *ppoints,
unsigned long totalpages, struct mem_cgroup *memcg,
const nodemask_t *nodemask)
const nodemask_t *nodemask, bool force_kill)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
......@@ -335,7 +336,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
if (unlikely(frozen(p)))
__thaw_task(p);
return ERR_PTR(-1UL);
if (!force_kill)
return ERR_PTR(-1UL);
}
if (!p->mm)
continue;
......@@ -353,7 +355,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
if (p == current) {
chosen = p;
*ppoints = 1000;
} else {
} else if (!force_kill) {
/*
* If this task is not being ptraced on exit,
* then wait for it to finish before killing
......@@ -434,66 +436,18 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
}
#define K(x) ((x) << (PAGE_SHIFT-10))
static int oom_kill_task(struct task_struct *p)
{
struct task_struct *q;
struct mm_struct *mm;
p = find_lock_task_mm(p);
if (!p)
return 1;
/* mm cannot be safely dereferenced after task_unlock(p) */
mm = p->mm;
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
task_pid_nr(p), p->comm, K(p->mm->total_vm),
K(get_mm_counter(p->mm, MM_ANONPAGES)),
K(get_mm_counter(p->mm, MM_FILEPAGES)));
task_unlock(p);
/*
* Kill all user processes sharing p->mm in other thread groups, if any.
* They don't get access to memory reserves or a higher scheduler
* priority, though, to avoid depletion of all memory or task
* starvation. This prevents mm->mmap_sem livelock when an oom killed
* task cannot exit because it requires the semaphore and its contended
* by another thread trying to allocate memory itself. That thread will
* now get access to memory reserves since it has a pending fatal
* signal.
*/
for_each_process(q)
if (q->mm == mm && !same_thread_group(q, p) &&
!(q->flags & PF_KTHREAD)) {
if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
continue;
task_lock(q); /* Protect ->comm from prctl() */
pr_err("Kill process %d (%s) sharing same memory\n",
task_pid_nr(q), q->comm);
task_unlock(q);
force_sig(SIGKILL, q);
}
set_tsk_thread_flag(p, TIF_MEMDIE);
force_sig(SIGKILL, p);
return 0;
}
#undef K
static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
unsigned int points, unsigned long totalpages,
struct mem_cgroup *memcg, nodemask_t *nodemask,
const char *message)
static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
unsigned int points, unsigned long totalpages,
struct mem_cgroup *memcg, nodemask_t *nodemask,
const char *message)
{
struct task_struct *victim = p;
struct task_struct *child;
struct task_struct *t = p;
struct mm_struct *mm;
unsigned int victim_points = 0;
if (printk_ratelimit())
dump_header(p, gfp_mask, order, memcg, nodemask);
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
/*
* If the task is already exiting, don't alarm the sysadmin or kill
......@@ -501,9 +455,12 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
*/
if (p->flags & PF_EXITING) {
set_tsk_thread_flag(p, TIF_MEMDIE);
return 0;
return;
}
if (__ratelimit(&oom_rs))
dump_header(p, gfp_mask, order, memcg, nodemask);
task_lock(p);
pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
message, task_pid_nr(p), p->comm, points);
......@@ -533,8 +490,44 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
}
} while_each_thread(p, t);
return oom_kill_task(victim);
victim = find_lock_task_mm(victim);
if (!victim)
return;
/* mm cannot safely be dereferenced after task_unlock(victim) */
mm = victim->mm;
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
K(get_mm_counter(victim->mm, MM_ANONPAGES)),
K(get_mm_counter(victim->mm, MM_FILEPAGES)));
task_unlock(victim);
/*
* Kill all user processes sharing victim->mm in other thread groups, if
* any. They don't get access to memory reserves, though, to avoid
* depletion of all memory. This prevents mm->mmap_sem livelock when an
* oom killed thread cannot exit because it requires the semaphore and
* its contended by another thread trying to allocate memory itself.
* That thread will now get access to memory reserves since it has a
* pending fatal signal.
*/
for_each_process(p)
if (p->mm == mm && !same_thread_group(p, victim) &&
!(p->flags & PF_KTHREAD)) {
if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
continue;
task_lock(p); /* Protect ->comm from prctl() */
pr_err("Kill process %d (%s) sharing same memory\n",
task_pid_nr(p), p->comm);
task_unlock(p);
force_sig(SIGKILL, p);
}
set_tsk_thread_flag(victim, TIF_MEMDIE);
force_sig(SIGKILL, victim);
}
#undef K
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
......@@ -561,7 +554,8 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask)
void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
unsigned long limit;
unsigned int points = 0;
......@@ -577,18 +571,13 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask)
return;
}
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
read_lock(&tasklist_lock);
retry:
p = select_bad_process(&points, limit, memcg, NULL);
if (!p || PTR_ERR(p) == -1UL)
goto out;
if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL,
"Memory cgroup out of memory"))
goto retry;
out:
p = select_bad_process(&points, limit, memcg, NULL, false);
if (p && PTR_ERR(p) != -1UL)
oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
"Memory cgroup out of memory");
read_unlock(&tasklist_lock);
}
#endif
......@@ -700,6 +689,7 @@ static void clear_system_oom(void)
* @gfp_mask: memory allocation flags
* @order: amount of memory being requested as a power of 2
* @nodemask: nodemask passed to page allocator
* @force_kill: true if a task must be killed, even if others are exiting
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
......@@ -707,7 +697,7 @@ static void clear_system_oom(void)
* don't have to be perfect here, we just have to be good.
*/
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *nodemask)
int order, nodemask_t *nodemask, bool force_kill)
{
const nodemask_t *mpol_mask;
struct task_struct *p;
......@@ -745,33 +735,25 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
if (sysctl_oom_kill_allocating_task &&
!oom_unkillable_task(current, NULL, nodemask) &&
current->mm) {
/*
* oom_kill_process() needs tasklist_lock held. If it returns
* non-zero, current could not be killed so we must fallback to
* the tasklist scan.
*/
if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
NULL, nodemask,
"Out of memory (oom_kill_allocating_task)"))
goto out;
}
retry:
p = select_bad_process(&points, totalpages, NULL, mpol_mask);
if (PTR_ERR(p) == -1UL)
oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
nodemask,
"Out of memory (oom_kill_allocating_task)");
goto out;
}
p = select_bad_process(&points, totalpages, NULL, mpol_mask,
force_kill);
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
read_unlock(&tasklist_lock);
panic("Out of memory and no killable processes...\n");
}
if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
nodemask, "Out of memory"))
goto retry;
killed = 1;
if (PTR_ERR(p) != -1UL) {
oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
nodemask, "Out of memory");
killed = 1;
}
out:
read_unlock(&tasklist_lock);
......@@ -792,7 +774,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
void pagefault_out_of_memory(void)
{
if (try_set_system_oom()) {
out_of_memory(NULL, 0, 0, NULL);
out_of_memory(NULL, 0, 0, NULL, false);
clear_system_oom();
}
if (!test_thread_flag(TIF_MEMDIE))
......
......@@ -1472,6 +1472,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
for ( ; ; ) {
global_dirty_limits(&background_thresh, &dirty_thresh);
dirty_thresh = hard_dirty_limit(dirty_thresh);
/*
* Boost the allowable dirty threshold a bit for page
......
......@@ -1968,7 +1968,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
}
/* Exhausted what can be done so it's blamo time */
out_of_memory(zonelist, gfp_mask, order, nodemask);
out_of_memory(zonelist, gfp_mask, order, nodemask, false);
out:
clear_zonelist_oom(zonelist, gfp_mask);
......@@ -1990,7 +1990,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
if (!order)
return NULL;
if (compaction_deferred(preferred_zone)) {
if (compaction_deferred(preferred_zone, order)) {
*deferred_compaction = true;
return NULL;
}
......@@ -2012,6 +2012,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
if (page) {
preferred_zone->compact_considered = 0;
preferred_zone->compact_defer_shift = 0;
if (order >= preferred_zone->compact_order_failed)
preferred_zone->compact_order_failed = order + 1;
count_vm_event(COMPACTSUCCESS);
return page;
}
......@@ -2028,7 +2030,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
* defer if the failure was a sync compaction failure.
*/
if (sync_migration)
defer_compaction(preferred_zone);
defer_compaction(preferred_zone, order);
cond_resched();
}
......@@ -2378,8 +2380,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
struct zone *preferred_zone;
struct page *page;
struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
gfp_mask &= gfp_allowed_mask;
......@@ -2398,15 +2401,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
get_mems_allowed();
retry_cpuset:
cpuset_mems_cookie = get_mems_allowed();
/* The preferred zone is used for statistics later */
first_zones_zonelist(zonelist, high_zoneidx,
nodemask ? : &cpuset_current_mems_allowed,
&preferred_zone);
if (!preferred_zone) {
put_mems_allowed();
return NULL;
}
if (!preferred_zone)
goto out;
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
......@@ -2416,9 +2419,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
put_mems_allowed();
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
out:
/*
* When updating a task's mems_allowed, it is possible to race with
* parallel threads in such a way that an allocation can fail while
* the mask is being updated. If a page allocation is about to fail,
* check if the cpuset changed during allocation and if so, retry.
*/
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
......@@ -2632,13 +2645,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
bool skip_free_areas_node(unsigned int flags, int nid)
{
bool ret = false;
unsigned int cpuset_mems_cookie;
if (!(flags & SHOW_MEM_FILTER_NODES))
goto out;
get_mems_allowed();
ret = !node_isset(nid, cpuset_current_mems_allowed);
put_mems_allowed();
do {
cpuset_mems_cookie = get_mems_allowed();
ret = !node_isset(nid, cpuset_current_mems_allowed);
} while (!put_mems_allowed(cpuset_mems_cookie));
out:
return ret;
}
......@@ -3925,18 +3940,6 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
}
}
/*
 * Feed every pfn range that for_each_mem_pfn_range() reports for @nid into
 * add_range(), threading the running @nr_range count through each call.
 *
 * @range and @az are passed to add_range() unchanged (presumably the
 * destination array and its capacity -- confirm against add_range()).
 *
 * Returns the value returned by the last add_range() call, or @nr_range
 * unchanged when no range is reported for the node.
 */
int __init add_from_early_node_map(struct range *range, int az,
				   int nr_range, int nid)
{
	unsigned long pfn_first, pfn_last;
	int idx;

	/* go over early_node_map to collect the ranges that belong to the node */
	for_each_mem_pfn_range(idx, nid, &pfn_first, &pfn_last, NULL) {
		nr_range = add_range(range, az, nr_range,
				     pfn_first, pfn_last);
	}

	return nr_range;
}
/**
* sparse_memory_present_with_active_regions - Call memory_present for each active range
* @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
......@@ -4521,7 +4524,7 @@ static unsigned long __init early_calculate_totalpages(void)
* memory. When they don't, some nodes will have more kernelcore than
* others
*/
static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
static void __init find_zone_movable_pfns_for_nodes(void)
{
int i, nid;
unsigned long usable_startpfn;
......@@ -4713,7 +4716,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
find_zone_movable_pfns_for_nodes(zone_movable_pfn);
find_zone_movable_pfns_for_nodes();
/* Print out the zone ranges */
printk("Zone PFN ranges:\n");
......@@ -4823,6 +4826,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
int cpu = (unsigned long)hcpu;
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
lru_add_drain_cpu(cpu);
drain_pages(cpu);
/*
......
......@@ -59,7 +59,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
continue;
split_huge_page_pmd(walk->mm, pmd);
if (pmd_none_or_clear_bad(pmd))
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
if (err)
......
......@@ -70,10 +70,11 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
int young;
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
#else
BUG();
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
young = pmdp_test_and_clear_young(vma, address, pmdp);
if (young)
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
......
此差异已折叠。
此差异已折叠。
......@@ -3284,12 +3284,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
if (in_interrupt() || (flags & __GFP_THISNODE))
return NULL;
nid_alloc = nid_here = numa_mem_id();
get_mems_allowed();
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
nid_alloc = cpuset_slab_spread_node();
else if (current->mempolicy)
nid_alloc = slab_node(current->mempolicy);
put_mems_allowed();
if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
......@@ -3312,14 +3310,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
enum zone_type high_zoneidx = gfp_zone(flags);
void *obj = NULL;
int nid;
unsigned int cpuset_mems_cookie;
if (flags & __GFP_THISNODE)
return NULL;
get_mems_allowed();
zonelist = node_zonelist(slab_node(current->mempolicy), flags);
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
retry_cpuset:
cpuset_mems_cookie = get_mems_allowed();
zonelist = node_zonelist(slab_node(current->mempolicy), flags);
retry:
/*
* Look through allowed nodes for objects available
......@@ -3372,7 +3373,9 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
}
}
}
put_mems_allowed();
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
goto retry_cpuset;
return obj;
}
......
此差异已折叠。
此差异已折叠。
......@@ -496,7 +496,7 @@ static void lru_deactivate_fn(struct page *page, void *arg)
* Either "cpu" is the current CPU, and preemption has already been
* disabled; or "cpu" is being hot-unplugged, and is already dead.
*/
static void drain_cpu_pagevecs(int cpu)
void lru_add_drain_cpu(int cpu)
{
struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
struct pagevec *pvec;
......@@ -553,7 +553,7 @@ void deactivate_page(struct page *page)
void lru_add_drain(void)
{
drain_cpu_pagevecs(get_cpu());
lru_add_drain_cpu(get_cpu());
put_cpu();
}
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册
反馈
建议
客服 返回
顶部