提交 3cb29d11 编写于 作者: V Vladimir Davydov 提交者: Linus Torvalds

cleancache: remove limit on the number of cleancache enabled filesystems

The limit equals 32 and is imposed by the number of entries in the
fs_poolid_map and shared_fs_poolid_map.  Nowadays it is insufficient,
because with containers on board a Linux host can have hundreds of
active fs mounts.

These maps were introduced by commit 49a9ab81 ("mm: cleancache:
lazy initialization to allow tmem backends to build/run as modules") in
order to allow compiling cleancache drivers as modules.  Real pool ids
are stored in these maps while super_block->cleancache_poolid points to
an entry in the map, so that on cleancache registration we can walk over
all (if there are <= 32 of them, of course) cleancache-enabled super
blocks and assign real pool ids.

Actually, there is absolutely no need in these maps, because we can
iterate over all super blocks immediately using iterate_supers.  This is
not racy, because cleancache_init_ops is called from mount_fs with
super_block->s_umount held for writing, while iterate_supers takes this
semaphore for reading, so if we call iterate_supers after setting
cleancache_ops, all super blocks that had been created before
cleancache_register_ops was called will be assigned pool ids by the
action function of iterate_supers while all newer super blocks will
receive it in cleancache_init_fs.

This patch therefore removes the maps and hence the artificial limit on
the number of cleancache enabled filesystems.
Signed-off-by: NVladimir Davydov <vdavydov@parallels.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Stefan Hengelein <ilendir@googlemail.com>
Cc: Florian Schmaus <fschmaus@gmail.com>
Cc: Andor Daam <andor.daam@googlemail.com>
Cc: Dan Magenheimer <dan.magenheimer@oracle.com>
Cc: Bob Liu <lliubbo@gmail.com>
Signed-off-by: NAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: NLinus Torvalds <torvalds@linux-foundation.org>
上级 53d85c98
master alk-4.19.24 alk-4.19.30 alk-4.19.34 alk-4.19.36 alk-4.19.43 alk-4.19.48 alk-4.19.57 ck-4.19.67 ck-4.19.81 ck-4.19.91 github/fork/deepanshu1422/fix-typo-in-comment github/fork/haosdent/fix-typo linux-next v4.19.91 v4.19.90 v4.19.89 v4.19.88 v4.19.87 v4.19.86 v4.19.85 v4.19.84 v4.19.83 v4.19.82 v4.19.81 v4.19.80 v4.19.79 v4.19.78 v4.19.77 v4.19.76 v4.19.75 v4.19.74 v4.19.73 v4.19.72 v4.19.71 v4.19.70 v4.19.69 v4.19.68 v4.19.67 v4.19.66 v4.19.65 v4.19.64 v4.19.63 v4.19.62 v4.19.61 v4.19.60 v4.19.59 v4.19.58 v4.19.57 v4.19.56 v4.19.55 v4.19.54 v4.19.53 v4.19.52 v4.19.51 v4.19.50 v4.19.49 v4.19.48 v4.19.47 v4.19.46 v4.19.45 v4.19.44 v4.19.43 v4.19.42 v4.19.41 v4.19.40 v4.19.39 v4.19.38 v4.19.37 v4.19.36 v4.19.35 v4.19.34 v4.19.33 v4.19.32 v4.19.31 v4.19.30 v4.19.29 v4.19.28 v4.19.27 v4.19.26 v4.19.25 v4.19.24 v4.19.23 v4.19.22 v4.19.21 v4.19.20 v4.19.19 v4.19.18 v4.19.17 v4.19.16 v4.19.15 v4.19.14 v4.19.13 v4.19.12 v4.19.11 v4.19.10 v4.19.9 v4.19.8 v4.19.7 v4.19.6 v4.19.5 v4.19.4 v4.19.3 v4.19.2 v4.19.1 v4.19 v4.19-rc8 v4.19-rc7 v4.19-rc6 v4.19-rc5 v4.19-rc4 v4.19-rc3 v4.19-rc2 v4.19-rc1 ck-release-21 ck-release-20 ck-release-19.2 ck-release-19.1 ck-release-19 ck-release-18 ck-release-17.2 ck-release-17.1 ck-release-17 ck-release-16 ck-release-15.1 ck-release-15 ck-release-14 ck-release-13.2 ck-release-13 ck-release-12 ck-release-11 ck-release-10 ck-release-9 ck-release-7 alk-release-15 alk-release-14 alk-release-13.2 alk-release-13 alk-release-12 alk-release-11 alk-release-10 alk-release-9 alk-release-7
无相关合并请求
......@@ -224,7 +224,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
s->s_maxbytes = MAX_NON_LFS;
s->s_op = &default_op;
s->s_time_gran = 1000000000;
s->cleancache_poolid = -1;
s->cleancache_poolid = CLEANCACHE_NO_POOL;
s->s_shrink.seeks = DEFAULT_SEEKS;
s->s_shrink.scan_objects = super_cache_scan;
......
......@@ -5,6 +5,10 @@
#include <linux/exportfs.h>
#include <linux/mm.h>
#define CLEANCACHE_NO_POOL -1
#define CLEANCACHE_NO_BACKEND -2
#define CLEANCACHE_NO_BACKEND_SHARED -3
#define CLEANCACHE_KEY_MAX 6
/*
......
......@@ -19,7 +19,7 @@
#include <linux/cleancache.h>
/*
* cleancache_ops is set by cleancache_ops_register to contain the pointers
* cleancache_ops is set by cleancache_register_ops to contain the pointers
* to the cleancache "backend" implementation functions.
*/
static struct cleancache_ops *cleancache_ops __read_mostly;
......@@ -34,104 +34,78 @@ static u64 cleancache_failed_gets;
static u64 cleancache_puts;
static u64 cleancache_invalidates;
/*
* When no backend is registered all calls to init_fs and init_shared_fs
* are registered and fake poolids (FAKE_FS_POOLID_OFFSET or
* FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array
* [shared_|]fs_poolid_map) are given to the respective super block
* (sb->cleancache_poolid) and no tmem_pools are created. When a backend
* registers with cleancache the previous calls to init_fs and init_shared_fs
* are executed to create tmem_pools and set the respective poolids. While no
* backend is registered all "puts", "gets" and "flushes" are ignored or failed.
*/
#define MAX_INITIALIZABLE_FS 32
#define FAKE_FS_POOLID_OFFSET 1000
#define FAKE_SHARED_FS_POOLID_OFFSET 2000
#define FS_NO_BACKEND (-1)
#define FS_UNKNOWN (-2)
static int fs_poolid_map[MAX_INITIALIZABLE_FS];
static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS];
static char *uuids[MAX_INITIALIZABLE_FS];
/*
* Mutex for the [shared_|]fs_poolid_map to guard against multiple threads
* invoking umount (and ending in __cleancache_invalidate_fs) and also multiple
* threads calling mount (and ending up in __cleancache_init_[shared|]fs).
*/
static DEFINE_MUTEX(poolid_mutex);
/*
* When set to false (default) all calls to the cleancache functions, except
* the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded
* by the if (!cleancache_ops) return. This means multiple threads (from
* different filesystems) will be checking cleancache_ops. The usage of a
* bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are
* OK if the time between the backend's have been initialized (and
* cleancache_ops has been set to not NULL) and when the filesystems start
* actually calling the backends. The inverse (when unloading) is obviously
* not good - but this shim does not do that (yet).
*/
/*
* The backends and filesystems work all asynchronously. This is b/c the
* backends can be built as modules.
* The usual sequence of events is:
* a) mount / -> __cleancache_init_fs is called. We set the
* [shared_|]fs_poolid_map and uuids for.
*
* b). user does I/Os -> we call the rest of __cleancache_* functions
* which return immediately as cleancache_ops is false.
*
* c). modprobe zcache -> cleancache_register_ops. We init the backend
* and set cleancache_ops to true, and for any fs_poolid_map
* (which is set by __cleancache_init_fs) we initialize the poolid.
*
* d). user does I/Os -> now that cleancache_ops is true all the
* __cleancache_* functions can call the backend. They all check
* that fs_poolid_map is valid and if so invoke the backend.
*
* e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is
* reset (which is the second check in the __cleancache_* ops
* to call the backend).
*
* The sequence of event could also be c), followed by a), and d). and e). The
* c) would not happen anymore. There is also the chance of c), and one thread
* doing a) + d), and another doing e). For that case we depend on the
* filesystem calling __cleancache_invalidate_fs in the proper sequence (so
* that it handles all I/Os before it invalidates the fs (which is last part
* of unmounting process).
*
* Note: The acute reader will notice that there is no "rmmod zcache" case.
* This is b/c the functionality for that is not yet implemented and when
* done, will require some extra locking not yet devised.
*/
static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
{
switch (sb->cleancache_poolid) {
case CLEANCACHE_NO_BACKEND:
__cleancache_init_fs(sb);
break;
case CLEANCACHE_NO_BACKEND_SHARED:
__cleancache_init_shared_fs(sb);
break;
}
}
/*
* Register operations for cleancache. Returns 0 on success.
*/
int cleancache_register_ops(struct cleancache_ops *ops)
{
int i;
mutex_lock(&poolid_mutex);
if (cleancache_ops) {
mutex_unlock(&poolid_mutex);
if (cmpxchg(&cleancache_ops, NULL, ops))
return -EBUSY;
}
for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
if (fs_poolid_map[i] == FS_NO_BACKEND)
fs_poolid_map[i] = ops->init_fs(PAGE_SIZE);
if (shared_fs_poolid_map[i] == FS_NO_BACKEND)
shared_fs_poolid_map[i] = ops->init_shared_fs
(uuids[i], PAGE_SIZE);
}
/*
* We MUST set cleancache_ops _after_ we have called the backends
* init_fs or init_shared_fs functions. Otherwise the compiler might
* re-order where cleancache_ops is set in this function.
* A cleancache backend can be built as a module and hence loaded after
* a cleancache enabled filesystem has called cleancache_init_fs. To
* handle such a scenario, here we call ->init_fs or ->init_shared_fs
* for each active super block. To differentiate between local and
* shared filesystems, we temporarily initialize sb->cleancache_poolid
* to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED
* respectively in case there is no backend registered at the time
* cleancache_init_fs or cleancache_init_shared_fs is called.
*
* Since filesystems can be mounted concurrently with cleancache
* backend registration, we have to be careful to guarantee that all
* cleancache enabled filesystems that has been mounted by the time
* cleancache_register_ops is called has got and all mounted later will
* get cleancache_poolid. This is assured by the following statements
* tied together:
*
* a) iterate_supers skips only those super blocks that has started
* ->kill_sb
*
* b) if iterate_supers encounters a super block that has not finished
* ->mount yet, it waits until it is finished
*
* c) cleancache_init_fs is called from ->mount and
* cleancache_invalidate_fs is called from ->kill_sb
*
* d) we call iterate_supers after cleancache_ops has been set
*
* From a) it follows that if iterate_supers skips a super block, then
* either the super block is already dead, in which case we do not need
* to bother initializing cleancache for it, or it was mounted after we
* initiated iterate_supers. In the latter case, it must have seen
* cleancache_ops set according to d) and initialized cleancache from
* ->mount by itself according to c). This proves that we call
* ->init_fs at least once for each active super block.
*
* From b) and c) it follows that if iterate_supers encounters a super
* block that has already started ->init_fs, it will wait until ->mount
* and hence ->init_fs has finished, then check cleancache_poolid, see
* that it has already been set and therefore do nothing. This proves
* that we call ->init_fs no more than once for each super block.
*
* Combined together, the last two paragraphs prove the function
* correctness.
*
* Note that various cleancache callbacks may proceed before this
* function is called or even concurrently with it, but since
* CLEANCACHE_NO_BACKEND is negative, they will all result in a noop
* until the corresponding ->init_fs has been actually called and
* cleancache_ops has been set.
*/
barrier();
cleancache_ops = ops;
mutex_unlock(&poolid_mutex);
iterate_supers(cleancache_register_ops_sb, NULL);
return 0;
}
EXPORT_SYMBOL(cleancache_register_ops);
......@@ -139,42 +113,28 @@ EXPORT_SYMBOL(cleancache_register_ops);
/* Called by a cleancache-enabled filesystem at time of mount */
void __cleancache_init_fs(struct super_block *sb)
{
int i;
int pool_id = CLEANCACHE_NO_BACKEND;
mutex_lock(&poolid_mutex);
for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
if (fs_poolid_map[i] == FS_UNKNOWN) {
sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET;
if (cleancache_ops)
fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE);
else
fs_poolid_map[i] = FS_NO_BACKEND;
break;
}
if (cleancache_ops) {
pool_id = cleancache_ops->init_fs(PAGE_SIZE);
if (pool_id < 0)
pool_id = CLEANCACHE_NO_POOL;
}
mutex_unlock(&poolid_mutex);
sb->cleancache_poolid = pool_id;
}
EXPORT_SYMBOL(__cleancache_init_fs);
/* Called by a cleancache-enabled clustered filesystem at time of mount */
void __cleancache_init_shared_fs(struct super_block *sb)
{
int i;
int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
mutex_lock(&poolid_mutex);
for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
if (shared_fs_poolid_map[i] == FS_UNKNOWN) {
sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET;
uuids[i] = sb->s_uuid;
if (cleancache_ops)
shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs
(sb->s_uuid, PAGE_SIZE);
else
shared_fs_poolid_map[i] = FS_NO_BACKEND;
break;
}
if (cleancache_ops) {
pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE);
if (pool_id < 0)
pool_id = CLEANCACHE_NO_POOL;
}
mutex_unlock(&poolid_mutex);
sb->cleancache_poolid = pool_id;
}
EXPORT_SYMBOL(__cleancache_init_shared_fs);
......@@ -203,19 +163,6 @@ static int cleancache_get_key(struct inode *inode,
return 0;
}
/*
* Returns a pool_id that is associated with a given fake poolid.
*/
static int get_poolid_from_fake(int fake_pool_id)
{
if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET)
return shared_fs_poolid_map[fake_pool_id -
FAKE_SHARED_FS_POOLID_OFFSET];
else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET)
return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET];
return FS_NO_BACKEND;
}
/*
* "Get" data from cleancache associated with the poolid/inode/index
* that were specified when the data was put to cleanache and, if
......@@ -231,7 +178,6 @@ int __cleancache_get_page(struct page *page)
{
int ret = -1;
int pool_id;
int fake_pool_id;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops) {
......@@ -240,17 +186,14 @@ int __cleancache_get_page(struct page *page)
}
VM_BUG_ON_PAGE(!PageLocked(page), page);
fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
if (fake_pool_id < 0)
pool_id = page->mapping->host->i_sb->cleancache_poolid;
if (pool_id < 0)
goto out;
pool_id = get_poolid_from_fake(fake_pool_id);
if (cleancache_get_key(page->mapping->host, &key) < 0)
goto out;
if (pool_id >= 0)
ret = cleancache_ops->get_page(pool_id,
key, page->index, page);
ret = cleancache_ops->get_page(pool_id, key, page->index, page);
if (ret == 0)
cleancache_succ_gets++;
else
......@@ -273,7 +216,6 @@ EXPORT_SYMBOL(__cleancache_get_page);
void __cleancache_put_page(struct page *page)
{
int pool_id;
int fake_pool_id;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops) {
......@@ -282,12 +224,7 @@ void __cleancache_put_page(struct page *page)
}
VM_BUG_ON_PAGE(!PageLocked(page), page);
fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
if (fake_pool_id < 0)
return;
pool_id = get_poolid_from_fake(fake_pool_id);
pool_id = page->mapping->host->i_sb->cleancache_poolid;
if (pool_id >= 0 &&
cleancache_get_key(page->mapping->host, &key) >= 0) {
cleancache_ops->put_page(pool_id, key, page->index, page);
......@@ -308,18 +245,13 @@ void __cleancache_invalidate_page(struct address_space *mapping,
struct page *page)
{
/* careful... page->mapping is NULL sometimes when this is called */
int pool_id;
int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
int pool_id = mapping->host->i_sb->cleancache_poolid;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops)
return;
if (fake_pool_id >= 0) {
pool_id = get_poolid_from_fake(fake_pool_id);
if (pool_id < 0)
return;
if (pool_id >= 0) {
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (cleancache_get_key(mapping->host, &key) >= 0) {
cleancache_ops->invalidate_page(pool_id,
......@@ -341,18 +273,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page);
*/
void __cleancache_invalidate_inode(struct address_space *mapping)
{
int pool_id;
int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
int pool_id = mapping->host->i_sb->cleancache_poolid;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops)
return;
if (fake_pool_id < 0)
return;
pool_id = get_poolid_from_fake(fake_pool_id);
if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
cleancache_ops->invalidate_inode(pool_id, key);
}
......@@ -365,32 +291,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode);
*/
void __cleancache_invalidate_fs(struct super_block *sb)
{
int index;
int fake_pool_id = sb->cleancache_poolid;
int old_poolid = fake_pool_id;
int pool_id;
mutex_lock(&poolid_mutex);
if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) {
index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET;
old_poolid = shared_fs_poolid_map[index];
shared_fs_poolid_map[index] = FS_UNKNOWN;
uuids[index] = NULL;
} else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) {
index = fake_pool_id - FAKE_FS_POOLID_OFFSET;
old_poolid = fs_poolid_map[index];
fs_poolid_map[index] = FS_UNKNOWN;
}
sb->cleancache_poolid = -1;
if (cleancache_ops)
cleancache_ops->invalidate_fs(old_poolid);
mutex_unlock(&poolid_mutex);
pool_id = sb->cleancache_poolid;
sb->cleancache_poolid = CLEANCACHE_NO_POOL;
if (cleancache_ops && pool_id >= 0)
cleancache_ops->invalidate_fs(pool_id);
}
EXPORT_SYMBOL(__cleancache_invalidate_fs);
static int __init init_cleancache(void)
{
int i;
#ifdef CONFIG_DEBUG_FS
struct dentry *root = debugfs_create_dir("cleancache", NULL);
if (root == NULL)
......@@ -402,10 +314,6 @@ static int __init init_cleancache(void)
debugfs_create_u64("invalidates", S_IRUGO,
root, &cleancache_invalidates);
#endif
for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
fs_poolid_map[i] = FS_UNKNOWN;
shared_fs_poolid_map[i] = FS_UNKNOWN;
}
return 0;
}
module_init(init_cleancache)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册
反馈
建议
客服 返回
顶部