Commit 8b9ea901 authored by Yunfeng Ye, committed by Yang Yingliang

pagecache: support percpu refcount to improve performance

euleros inclusion
category: feature
feature: pagecache percpu refcount
bugzilla: 31398
CVE: NA

-------------------------------------------------

The pagecache manages a file's physical pages, and the life cycle of each
page is tracked by an atomic reference count. As the number of CPU cores
grows, the cost of this atomic counting becomes very high when file
pagecaches are read with high concurrency.

For example, when running the nginx http application, the biggest hotspot
is found in the atomic operation inside find_get_entry() (a sample perf
invocation is sketched after the listing below):

 11.94% [kernel] [k] find_get_entry
  7.45% [kernel] [k] do_tcp_sendpages
  6.12% [kernel] [k] generic_file_buffered_read
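
A profile like the one above can be collected with a standard perf
invocation; the exact options used for the original measurement are not
recorded in this commit, so the following is only an assumed example:

  # sample all CPUs with call graphs while the nginx benchmark is running
  perf record -a -g -- sleep 30
  # list the hottest kernel symbols
  perf report --sort symbol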

So we use the percpu refcount mechanism to fix this problem (a conceptual
sketch of the refcounting pattern follows the table below), and the test
results show that the read performance of nginx http can be improved by
more than 100%:

  worker   original(requests/sec)   percpu(requests/sec)   improve
  64       759656.87                1627088.95             114.2%
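
The speedup comes from percpu_ref: on the fast path, percpu_ref_get() and
percpu_ref_put() only touch a per-CPU counter, avoiding the cache-line
contention of a shared atomic count. Below is a minimal, self-contained
sketch of the init/get/put/kill lifecycle that this patch applies to
pagecache pages; the struct and function names are illustrative, not part
of the patch:

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/slab.h>

struct obj {
	struct percpu_ref ref;		/* released via a callback */
};

static void obj_release(struct percpu_ref *ref)
{
	struct obj *o = container_of(ref, struct obj, ref);

	percpu_ref_exit(&o->ref);	/* free the per-CPU counters */
	kfree(o);			/* last reference is gone */
}

static struct obj *obj_create(void)
{
	struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

	if (!o)
		return NULL;
	if (percpu_ref_init(&o->ref, obj_release, 0, GFP_KERNEL)) {
		kfree(o);
		return NULL;
	}
	return o;
}

/*
 * Fast path: percpu_ref_get(&o->ref) / percpu_ref_put(&o->ref) only
 * update a per-CPU counter, with no shared atomic operation.
 *
 * Teardown: percpu_ref_kill(&o->ref) switches the ref to atomic mode;
 * once the count drops to zero, obj_release() is called.
 */

The patch follows the same pattern: page_cache_init() allocates the ref
and attaches it to the page via page->lru, get_page()/put_page() take the
percpu fast path when PagePercpuRef() is set, and page_cache_kill() /
free_page_ref() handle teardown when the page leaves the pagecache.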

Notes: we use page->lru to store the percpu ref information, so pages with
the percpu attribute will not be reclaimed by the memory reclaim process;
we should therefore avoid growing such files without limit.
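
From userspace the behaviour is opt-in per file through the new fcntl
command. A minimal usage sketch, assuming the updated uapi header that
defines F_MAPPING_PERCPU (the file path below is only a hypothetical
example):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#ifndef F_LINUX_SPECIFIC_BASE
#define F_LINUX_SPECIFIC_BASE	1024
#endif
#ifndef F_MAPPING_PERCPU
#define F_MAPPING_PERCPU	(F_LINUX_SPECIFIC_BASE + 15)
#endif

int main(void)
{
	/* hypothetical hot file served by nginx */
	int fd = open("/srv/www/hot-file.html", O_RDONLY);

	if (fd < 0)
		return 1;

	/* arg != 0: mark the file's address_space as percpu refcounted */
	if (fcntl(fd, F_MAPPING_PERCPU, 1) < 0)
		perror("F_MAPPING_PERCPU");

	/* ... read the file so its pages enter the pagecache ... */

	/* arg == 0: pages cached afterwards use atomic refcounting again */
	fcntl(fd, F_MAPPING_PERCPU, 0);
	close(fd);
	return 0;
}
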
Signed-off-by: Yunfeng Ye <yeyunfeng@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Parent 57457a65
......@@ -29,6 +29,7 @@
#include <linux/poll.h>
#include <asm/siginfo.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
......@@ -319,6 +320,22 @@ static long fcntl_rw_hint(struct file *file, unsigned int cmd,
	}
}

static long fcntl_mapping_percpu(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct address_space *mapping = filp->f_mapping;
	unsigned long flag = arg;

	if (!mapping)
		return -EINVAL;

	if (flag)
		mapping_set_percpu_ref(mapping);
	else
		mapping_clear_percpu_ref(mapping);

	return 0;
}

static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
		struct file *filp)
{
......@@ -426,6 +443,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
	case F_SET_FILE_RW_HINT:
		err = fcntl_rw_hint(filp, cmd, arg);
		break;
	case F_MAPPING_PERCPU:
		err = fcntl_mapping_percpu(filp, cmd, arg);
		break;
	default:
		break;
	}
......
......@@ -528,6 +528,10 @@ static inline int pgd_devmap(pgd_t pgd)
static inline int put_page_testzero(struct page *page)
{
	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
	if (PagePercpuRef(page)) {
		percpu_ref_put(page_percpu_ref(page));
		return 0;
	}
	return page_ref_dec_and_test(page);
}
......@@ -539,6 +543,10 @@ static inline int put_page_testzero(struct page *page)
*/
static inline int get_page_unless_zero(struct page *page)
{
	if (PagePercpuRef(page)) {
		percpu_ref_get(page_percpu_ref(page));
		return true;
	}
	return page_ref_add_unless(page, 1, 0);
}
......@@ -928,6 +936,11 @@ static inline bool is_device_public_page(const struct page *page)
static inline void get_page(struct page *page)
{
	page = compound_head(page);
	if (PagePercpuRef(page)) {
		percpu_ref_get(page_percpu_ref(page));
		return;
	}
	/*
	 * Getting a normal page or the head of a compound page
	 * requires to already have an elevated page->_refcount.
......@@ -939,6 +952,11 @@ static inline void get_page(struct page *page)
static inline __must_check bool try_get_page(struct page *page)
{
	page = compound_head(page);
	if (PagePercpuRef(page)) {
		percpu_ref_get(page_percpu_ref(page));
		return true;
	}
	if (WARN_ON_ONCE(page_ref_count(page) <= 0))
		return false;
	page_ref_inc(page);
......@@ -949,6 +967,11 @@ static inline void put_page(struct page *page)
{
	page = compound_head(page);
	if (PagePercpuRef(page)) {
		percpu_ref_put(page_percpu_ref(page));
		return;
	}
	/*
	 * For devmap managed pages we need to catch refcount transition from
	 * 2 to 1, when refcount reach one it means the page is free and we
......
......@@ -101,6 +101,7 @@ enum pageflags {
	PG_young,
	PG_idle,
#endif
	PG_percpu_ref,
	__NR_PAGEFLAGS,
/* Filesystems */
......@@ -385,6 +386,7 @@ SETPAGEFLAG(Young, young, PF_ANY)
TESTCLEARFLAG(Young, young, PF_ANY)
PAGEFLAG(Idle, idle, PF_ANY)
#endif
PAGEFLAG(PercpuRef, percpu_ref, PF_ANY)
/*
* On an anonymous page mapped into a user virtual memory area,
......
......@@ -180,4 +180,15 @@ static inline void page_ref_unfreeze(struct page *page, int count)
	__page_ref_unfreeze(page, count);
}

static inline struct percpu_ref *page_percpu_ref(struct page *page)
{
	return *(struct percpu_ref **)&page->lru;
}

static inline void page_set_percpu_ref(struct page *page,
				       struct percpu_ref *ref)
{
	*(struct percpu_ref **)&page->lru = ref;
}

#endif
......@@ -29,6 +29,7 @@ enum mapping_flags {
	AS_EXITING = 4, 	/* final truncate in progress */
	/* writeback related tags are not used */
	AS_NO_WRITEBACK_TAGS = 5,
	AS_PERCPU_REF = 6,	/* percpu ref counter for special inode */
};
/**
......@@ -97,6 +98,21 @@ static inline int mapping_use_writeback_tags(struct address_space *mapping)
	return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}

static inline void mapping_set_percpu_ref(struct address_space *mapping)
{
	set_bit(AS_PERCPU_REF, &mapping->flags);
}

static inline void mapping_clear_percpu_ref(struct address_space *mapping)
{
	clear_bit(AS_PERCPU_REF, &mapping->flags);
}

static inline int mapping_percpu_ref(struct address_space *mapping)
{
	return test_bit(AS_PERCPU_REF, &mapping->flags);
}

static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
{
	return mapping->gfp_mask;
......@@ -170,6 +186,10 @@ static inline int page_cache_get_speculative(struct page *page)
# ifdef CONFIG_PREEMPT_COUNT
	VM_BUG_ON(!in_atomic() && !irqs_disabled());
# endif
	if (PagePercpuRef(page)) {
		percpu_ref_get(page_percpu_ref(page));
		return 1;
	}
	/*
	 * Preempt must be disabled here - we rely on rcu_read_lock doing
	 * this for us.
......@@ -183,6 +203,10 @@ static inline int page_cache_get_speculative(struct page *page)
	page_ref_inc(page);

#else
	if (PagePercpuRef(page)) {
		percpu_ref_get(page_percpu_ref(page));
		return 1;
	}
	if (unlikely(!get_page_unless_zero(page))) {
		/*
		 * Either the page has been freed, or will be freed.
......
......@@ -104,7 +104,8 @@ IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \
IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \
IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \
IF_HAVE_PG_IDLE(PG_young, "young" ) \
IF_HAVE_PG_IDLE(PG_idle, "idle" )
IF_HAVE_PG_IDLE(PG_idle, "idle"), \
{1UL << PG_percpu_ref, "percpu_ref" }
#define show_page_flags(flags) \
(flags) ? __print_flags(flags, "|", \
......
......@@ -53,6 +53,8 @@
#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13)
#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)
#define F_MAPPING_PERCPU (F_LINUX_SPECIFIC_BASE + 15)
/*
* Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
* used to clear any hints previously set.
......
......@@ -48,6 +48,66 @@
#include <asm/mman.h>

struct percpu_page {
	struct percpu_ref ref;
	struct page *page;
};

static void free_page_ref(struct percpu_ref *ref)
{
	struct percpu_page *p = (struct percpu_page *)ref;
	struct page *page = p->page;

	percpu_ref_exit(ref);
	kfree(page_percpu_ref(page));
	page_set_percpu_ref(page, NULL);
	ClearPagePercpuRef(page);

	/* really free the page */
	put_page(page);
}

static void page_cache_init(struct address_space *mapping, struct page *page)
{
	struct percpu_page *p;

	if (!mapping_percpu_ref(mapping))
		return;

	p = kzalloc(sizeof(struct percpu_page), GFP_KERNEL);
	if (!p)
		return;

	if (percpu_ref_init(&p->ref, free_page_ref, 0, GFP_KERNEL))
		goto err;

	p->page = page;
	page_set_percpu_ref(page, &p->ref);
	SetPagePercpuRef(page);
	get_page(page);
	return;
err:
	kfree(p);
}

static void page_cache_exit(struct page *page)
{
	if (!PagePercpuRef(page))
		return;

	put_page(page);
	ClearPagePercpuRef(page);
	percpu_ref_exit(page_percpu_ref(page));
	kfree(page_percpu_ref(page));
	page_set_percpu_ref(page, NULL);
}

static void page_cache_kill(struct page *page)
{
	if (!PagePercpuRef(page))
		return;

	percpu_ref_kill(page_percpu_ref(page));
}

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
......@@ -264,6 +324,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
	unaccount_page_cache_page(mapping, page);
	page_cache_tree_delete(mapping, page, shadow);
	page_cache_kill(page);
}
static void page_cache_free_page(struct address_space *mapping,
......@@ -384,8 +445,10 @@ void delete_from_page_cache_batch(struct address_space *mapping,
	page_cache_tree_delete_batch(mapping, pvec);
	xa_unlock_irqrestore(&mapping->i_pages, flags);

	for (i = 0; i < pagevec_count(pvec); i++)
	for (i = 0; i < pagevec_count(pvec); i++) {
		page_cache_kill(pvec->pages[i]);
		page_cache_free_page(mapping, pvec->pages[i]);
	}
}
int filemap_check_errors(struct address_space *mapping)
......@@ -966,7 +1029,8 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
			workingset_activation(page);
		} else
			ClearPageActive(page);
		lru_cache_add(page);
		if (!PagePercpuRef(page))
			lru_cache_add(page);
	}
	return ret;
}
......@@ -1630,8 +1694,10 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
		if (fgp_flags & FGP_ACCESSED)
			__SetPageReferenced(page);

		page_cache_init(mapping, page);
		err = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
		if (unlikely(err)) {
			page_cache_exit(page);
			put_page(page);
			page = NULL;
			if (err == -EEXIST)
......@@ -2320,9 +2386,11 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb,
			error = -ENOMEM;
			goto out;
		}

		page_cache_init(mapping, page);
		error = add_to_page_cache_lru(page, mapping, index,
				mapping_gfp_constraint(mapping, GFP_KERNEL));
		if (error) {
			page_cache_exit(page);
			put_page(page);
			if (error == -EEXIST) {
				error = 0;
......@@ -2837,8 +2905,10 @@ static struct page *do_read_cache_page(struct address_space *mapping,
		page = __page_cache_alloc(gfp);
		if (!page)
			return ERR_PTR(-ENOMEM);

		page_cache_init(mapping, page);
		err = add_to_page_cache_lru(page, mapping, index, gfp);
		if (unlikely(err)) {
			page_cache_exit(page);
			put_page(page);
			if (err == -EEXIST)
				goto repeat;
......
......@@ -372,6 +372,8 @@ static void __lru_cache_activate_page(struct page *page)
void mark_page_accessed(struct page *page)
{
	page = compound_head(page);

	if (PagePercpuRef(page))
		return;

	if (!PageActive(page) && !PageUnevictable(page) &&
			PageReferenced(page)) {
......