提交 34e55232 编写于 作者: K KAMEZAWA Hiroyuki 提交者: Linus Torvalds

mm: avoid false sharing of mm_counter

Per-mm statistics are, by nature, an object shared among threads, so
updating them can be a cache-miss point in the page fault path.

This patch adds a per-thread cache for mm_counter.  RSS values are
counted into a struct in task_struct and synchronized with the mm's
counters on certain events.

For now, in this patch, the event is the number of calls to handle_mm_fault.
The per-thread value is added to the mm once every 64 calls.

 A rough estimate from a small benchmark with parallel threads (2 threads) shows
 [before]
     4.5 cache-miss/faults
 [after]
     4.0 cache-miss/faults
 Anyway, the most contended object is mmap_sem if the number of threads grows.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
上级 d559db08
...@@ -188,6 +188,12 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file ...@@ -188,6 +188,12 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file
contains details information about the process itself. Its fields are contains details information about the process itself. Its fields are
explained in Table 1-4. explained in Table 1-4.
(for SMP CONFIG users)
To make accounting scalable, RSS-related information is handled in an
asynchronous manner, so the value may not be very precise. To see a precise
snapshot of a moment, you can read the /proc/<pid>/smaps file, which scans the
page table. It's slow but very precise.
Table 1-2: Contents of the statm files (as of 2.6.30-rc7) Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
.............................................................................. ..............................................................................
Field Content Field Content
......
...@@ -718,6 +718,7 @@ static int exec_mmap(struct mm_struct *mm) ...@@ -718,6 +718,7 @@ static int exec_mmap(struct mm_struct *mm)
/* Notify parent that we're no longer interested in the old VM */ /* Notify parent that we're no longer interested in the old VM */
tsk = current; tsk = current;
old_mm = current->mm; old_mm = current->mm;
sync_mm_rss(tsk, old_mm);
mm_release(tsk, old_mm); mm_release(tsk, old_mm);
if (old_mm) { if (old_mm) {
......
...@@ -873,7 +873,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, ...@@ -873,7 +873,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
/* /*
* per-process(per-mm_struct) statistics. * per-process(per-mm_struct) statistics.
*/ */
#if USE_SPLIT_PTLOCKS #if defined(SPLIT_RSS_COUNTING)
/* /*
* The mm counters are not protected by its page_table_lock, * The mm counters are not protected by its page_table_lock,
* so must be incremented atomically. * so must be incremented atomically.
...@@ -883,10 +883,7 @@ static inline void set_mm_counter(struct mm_struct *mm, int member, long value) ...@@ -883,10 +883,7 @@ static inline void set_mm_counter(struct mm_struct *mm, int member, long value)
atomic_long_set(&mm->rss_stat.count[member], value); atomic_long_set(&mm->rss_stat.count[member], value);
} }
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) unsigned long get_mm_counter(struct mm_struct *mm, int member);
{
return (unsigned long)atomic_long_read(&mm->rss_stat.count[member]);
}
static inline void add_mm_counter(struct mm_struct *mm, int member, long value) static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{ {
...@@ -974,6 +971,7 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, ...@@ -974,6 +971,7 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
*maxrss = hiwater_rss; *maxrss = hiwater_rss;
} }
void sync_mm_rss(struct task_struct *task, struct mm_struct *mm);
/* /*
* A callback you can register to apply pressure to ageable caches. * A callback you can register to apply pressure to ageable caches.
......
...@@ -202,9 +202,15 @@ enum { ...@@ -202,9 +202,15 @@ enum {
}; };
#if USE_SPLIT_PTLOCKS #if USE_SPLIT_PTLOCKS
#define SPLIT_RSS_COUNTING
struct mm_rss_stat { struct mm_rss_stat {
atomic_long_t count[NR_MM_COUNTERS]; atomic_long_t count[NR_MM_COUNTERS];
}; };
/*
 * Per-thread cache of RSS counter updates. Deltas are accumulated here and
 * later added to the mm's counters, so that threads do not all write the
 * shared mm counters on every update.
 */
struct task_rss_stat {
int events; /* events counted since the last sync; compared against the sync threshold */
int count[NR_MM_COUNTERS]; /* pending signed delta for each mm counter */
};
#else /* !USE_SPLIT_PTLOCKS */ #else /* !USE_SPLIT_PTLOCKS */
struct mm_rss_stat { struct mm_rss_stat {
unsigned long count[NR_MM_COUNTERS]; unsigned long count[NR_MM_COUNTERS];
......
...@@ -1220,7 +1220,9 @@ struct task_struct { ...@@ -1220,7 +1220,9 @@ struct task_struct {
struct plist_node pushable_tasks; struct plist_node pushable_tasks;
struct mm_struct *mm, *active_mm; struct mm_struct *mm, *active_mm;
#if defined(SPLIT_RSS_COUNTING)
struct task_rss_stat rss_stat;
#endif
/* task state */ /* task state */
int exit_state; int exit_state;
int exit_code, exit_signal; int exit_code, exit_signal;
......
...@@ -952,7 +952,8 @@ NORET_TYPE void do_exit(long code) ...@@ -952,7 +952,8 @@ NORET_TYPE void do_exit(long code)
preempt_count()); preempt_count());
acct_update_integrals(tsk); acct_update_integrals(tsk);
/* sync mm's RSS info before statistics gathering */
sync_mm_rss(tsk, tsk->mm);
group_dead = atomic_dec_and_test(&tsk->signal->live); group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) { if (group_dead) {
hrtimer_cancel(&tsk->signal->real_timer); hrtimer_cancel(&tsk->signal->real_timer);
......
...@@ -122,6 +122,79 @@ static int __init init_zero_pfn(void) ...@@ -122,6 +122,79 @@ static int __init init_zero_pfn(void)
core_initcall(init_zero_pfn); core_initcall(init_zero_pfn);
#if defined(SPLIT_RSS_COUNTING)
/*
 * Fold the RSS deltas cached in @task into @mm and reset the cache.
 *
 * @task: task whose rss_stat cache is flushed.
 * @mm:   mm that receives the cached deltas; may be NULL.
 *
 * Callers hand over the task's mm without checking it first (exec_mmap()
 * syncs before it tests old_mm), so a NULL @mm must not be dereferenced.
 * In that case any cached delta has no mm it can be attributed to and is
 * dropped, which still leaves the cache in a clean state.
 */
void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (!task->rss_stat.count[i])
			continue;
		/* Only touch the shared counter when there is an mm to update. */
		if (likely(mm))
			add_mm_counter(mm, i, task->rss_stat.count[i]);
		task->rss_stat.count[i] = 0;
	}
	/* Start a fresh counting window for the sync threshold. */
	task->rss_stat.events = 0;
}
/*
 * Adjust counter @member of @mm by @val, avoiding the shared counter when
 * possible.
 *
 * If @mm is the current task's own mm, the delta is accumulated in the
 * task's rss_stat cache. For any other mm it is applied to @mm directly
 * through add_mm_counter().
 */
static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *self = current;

	/* Foreign mm: no per-task cache applies, update the mm counter itself. */
	if (unlikely(self->mm != mm)) {
		add_mm_counter(mm, member, val);
		return;
	}

	self->rss_stat.count[member] += val;
}
/* Single-step wrappers around add_mm_counter_fast(): +1 and -1 on counter @member of @mm. */
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)

/*
 * Count one event for @task and, on every TASK_RSS_EVENTS_THRESH-th call,
 * flush the task's cached RSS deltas into task->mm.
 *
 * @task: task to account the event to; ignored unless it is the current
 *        task.
 */
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;

	/*
	 * Pre-increment and compare with >= so the flush happens on exactly
	 * the TASK_RSS_EVENTS_THRESH-th call. A post-increment compared with
	 * > would only fire two calls later. __sync_task_rss_stat() resets
	 * events to 0, which starts the next window.
	 */
	if (unlikely(++task->rss_stat.events >= TASK_RSS_EVENTS_THRESH))
		__sync_task_rss_stat(task, task->mm);
}
/*
 * Return the current value of counter @member of @mm, clamped at zero.
 *
 * Only the mm's own atomic counter is read. The mm is passed in explicitly
 * rather than looked up through a task, and the caller must guarantee that
 * @mm is valid for the duration of the call.
 */
unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
	long snapshot = atomic_long_read(&mm->rss_stat.count[member]);

	/*
	 * The counter is updated asynchronously, so a snapshot may be
	 * negative. That is never a meaningful number for users, so report
	 * it as 0.
	 */
	return snapshot < 0 ? 0 : (unsigned long)snapshot;
}
/*
 * Flush the RSS deltas cached in @task into @mm by calling
 * __sync_task_rss_stat().
 */
void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
{
__sync_task_rss_stat(task, mm);
}
#else
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
static void check_sync_rss_stat(struct task_struct *task)
{
}
void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
{
}
#endif
/* /*
* If a p?d_bad entry is found while walking page tables, report * If a p?d_bad entry is found while walking page tables, report
* the error, before resetting entry to p?d_none. Usually (but * the error, before resetting entry to p?d_none. Usually (but
...@@ -386,6 +459,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) ...@@ -386,6 +459,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{ {
int i; int i;
if (current->mm == mm)
sync_mm_rss(current, mm);
for (i = 0; i < NR_MM_COUNTERS; i++) for (i = 0; i < NR_MM_COUNTERS; i++)
if (rss[i]) if (rss[i])
add_mm_counter(mm, i, rss[i]); add_mm_counter(mm, i, rss[i]);
...@@ -1539,7 +1614,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, ...@@ -1539,7 +1614,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
/* Ok, finally just insert the thing.. */ /* Ok, finally just insert the thing.. */
get_page(page); get_page(page);
inc_mm_counter(mm, MM_FILEPAGES); inc_mm_counter_fast(mm, MM_FILEPAGES);
page_add_file_rmap(page); page_add_file_rmap(page);
set_pte_at(mm, addr, pte, mk_pte(page, prot)); set_pte_at(mm, addr, pte, mk_pte(page, prot));
...@@ -2175,11 +2250,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -2175,11 +2250,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (likely(pte_same(*page_table, orig_pte))) { if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) { if (old_page) {
if (!PageAnon(old_page)) { if (!PageAnon(old_page)) {
dec_mm_counter(mm, MM_FILEPAGES); dec_mm_counter_fast(mm, MM_FILEPAGES);
inc_mm_counter(mm, MM_ANONPAGES); inc_mm_counter_fast(mm, MM_ANONPAGES);
} }
} else } else
inc_mm_counter(mm, MM_ANONPAGES); inc_mm_counter_fast(mm, MM_ANONPAGES);
flush_cache_page(vma, address, pte_pfn(orig_pte)); flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot); entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma); entry = maybe_mkwrite(pte_mkdirty(entry), vma);
...@@ -2616,7 +2691,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -2616,7 +2691,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
* discarded at swap_free(). * discarded at swap_free().
*/ */
inc_mm_counter(mm, MM_ANONPAGES); inc_mm_counter_fast(mm, MM_ANONPAGES);
pte = mk_pte(page, vma->vm_page_prot); pte = mk_pte(page, vma->vm_page_prot);
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma); pte = maybe_mkwrite(pte_mkdirty(pte), vma);
...@@ -2700,7 +2775,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -2700,7 +2775,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte_none(*page_table)) if (!pte_none(*page_table))
goto release; goto release;
inc_mm_counter(mm, MM_ANONPAGES); inc_mm_counter_fast(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address); page_add_new_anon_rmap(page, vma, address);
setpte: setpte:
set_pte_at(mm, address, page_table, entry); set_pte_at(mm, address, page_table, entry);
...@@ -2854,10 +2929,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -2854,10 +2929,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & FAULT_FLAG_WRITE) if (flags & FAULT_FLAG_WRITE)
entry = maybe_mkwrite(pte_mkdirty(entry), vma); entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (anon) { if (anon) {
inc_mm_counter(mm, MM_ANONPAGES); inc_mm_counter_fast(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address); page_add_new_anon_rmap(page, vma, address);
} else { } else {
inc_mm_counter(mm, MM_FILEPAGES); inc_mm_counter_fast(mm, MM_FILEPAGES);
page_add_file_rmap(page); page_add_file_rmap(page);
if (flags & FAULT_FLAG_WRITE) { if (flags & FAULT_FLAG_WRITE) {
dirty_page = page; dirty_page = page;
...@@ -3035,6 +3110,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -3035,6 +3110,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(PGFAULT); count_vm_event(PGFAULT);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
if (unlikely(is_vm_hugetlb_page(vma))) if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags); return hugetlb_fault(mm, vma, address, flags);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册