diff --git a/fs/dcache.c b/fs/dcache.c index 41e2085d430bd3aa16c53d608335ffc1aee487fd..2762804a140db03cb432648a1b38d910f24f640d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -743,13 +743,11 @@ static void shrink_dentry_list(struct list_head *list) * * If flags contains DCACHE_REFERENCED reference dentries will not be pruned. */ -static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) +static void __shrink_dcache_sb(struct super_block *sb, int count, int flags) { - /* called from prune_dcache() and shrink_dcache_parent() */ struct dentry *dentry; LIST_HEAD(referenced); LIST_HEAD(tmp); - int cnt = *count; relock: spin_lock(&dcache_lru_lock); @@ -777,7 +775,7 @@ static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) } else { list_move_tail(&dentry->d_lru, &tmp); spin_unlock(&dentry->d_lock); - if (!--cnt) + if (!--count) break; } cond_resched_lock(&dcache_lru_lock); @@ -787,83 +785,22 @@ static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) spin_unlock(&dcache_lru_lock); shrink_dentry_list(&tmp); - - *count = cnt; } /** - * prune_dcache - shrink the dcache - * @count: number of entries to try to free + * prune_dcache_sb - shrink the dcache + * @nr_to_scan: number of entries to try to free * - * Shrink the dcache. This is done when we need more memory, or simply when we - * need to unmount something (at which point we need to unuse all dentries). + * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is + * done when we need more memory an called from the superblock shrinker + * function. * - * This function may fail to free any resources if all the dentries are in use. + * This function may fail to free any resources if all the dentries are in + * use. */ -static void prune_dcache(int count) +void prune_dcache_sb(struct super_block *sb, int nr_to_scan) { - struct super_block *sb, *p = NULL; - int w_count; - int unused = dentry_stat.nr_unused; - int prune_ratio; - int pruned; - - if (unused == 0 || count == 0) - return; - if (count >= unused) - prune_ratio = 1; - else - prune_ratio = unused / count; - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) - continue; - if (sb->s_nr_dentry_unused == 0) - continue; - sb->s_count++; - /* Now, we reclaim unused dentrins with fairness. - * We reclaim them same percentage from each superblock. - * We calculate number of dentries to scan on this sb - * as follows, but the implementation is arranged to avoid - * overflows: - * number of dentries to scan on this sb = - * count * (number of dentries on this sb / - * number of dentries in the machine) - */ - spin_unlock(&sb_lock); - if (prune_ratio != 1) - w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1; - else - w_count = sb->s_nr_dentry_unused; - pruned = w_count; - /* - * We need to be sure this filesystem isn't being unmounted, - * otherwise we could race with generic_shutdown_super(), and - * end up holding a reference to an inode while the filesystem - * is unmounted. So we try to get s_umount, and make sure - * s_root isn't NULL. - */ - if (down_read_trylock(&sb->s_umount)) { - if ((sb->s_root != NULL) && - (!list_empty(&sb->s_dentry_lru))) { - __shrink_dcache_sb(sb, &w_count, - DCACHE_REFERENCED); - pruned -= w_count; - } - up_read(&sb->s_umount); - } - spin_lock(&sb_lock); - if (p) - __put_super(p); - count -= pruned; - p = sb; - /* more work left to do? */ - if (count <= 0) - break; - } - if (p) - __put_super(p); - spin_unlock(&sb_lock); + __shrink_dcache_sb(sb, nr_to_scan, DCACHE_REFERENCED); } /** @@ -1238,42 +1175,10 @@ void shrink_dcache_parent(struct dentry * parent) int found; while ((found = select_parent(parent)) != 0) - __shrink_dcache_sb(sb, &found, 0); + __shrink_dcache_sb(sb, found, 0); } EXPORT_SYMBOL(shrink_dcache_parent); -/* - * Scan `sc->nr_slab_to_reclaim' dentries and return the number which remain. - * - * We need to avoid reentering the filesystem if the caller is performing a - * GFP_NOFS allocation attempt. One example deadlock is: - * - * ext2_new_block->getblk->GFP->shrink_dcache_memory->prune_dcache-> - * prune_one_dentry->dput->dentry_iput->iput->inode->i_sb->s_op->put_inode-> - * ext2_discard_prealloc->ext2_free_blocks->lock_super->DEADLOCK. - * - * In this case we return -1 to tell the caller that we baled. - */ -static int shrink_dcache_memory(struct shrinker *shrink, - struct shrink_control *sc) -{ - int nr = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; - - if (nr) { - if (!(gfp_mask & __GFP_FS)) - return -1; - prune_dcache(nr); - } - - return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; -} - -static struct shrinker dcache_shrinker = { - .shrink = shrink_dcache_memory, - .seeks = DEFAULT_SEEKS, -}; - /** * __d_alloc - allocate a dcache entry * @sb: filesystem it will belong to @@ -3083,8 +2988,6 @@ static void __init dcache_init(void) */ dentry_cache = KMEM_CACHE(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); - - register_shrinker(&dcache_shrinker); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/fs/inode.c b/fs/inode.c index 0450e25aeda0887e78afbde55f0da9958c800ca1..1fdbb64a952f3fefde204d964cf141fe5c284085 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -73,7 +73,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); * * We don't actually need it to protect anything in the umount path, * but only need to cycle through it to make sure any inode that - * prune_icache took off the LRU list has been fully torn down by the + * prune_icache_sb took off the LRU list has been fully torn down by the * time we are past evict_inodes. */ static DECLARE_RWSEM(iprune_sem); @@ -544,7 +544,7 @@ void evict_inodes(struct super_block *sb) dispose_list(&dispose); /* - * Cycle through iprune_sem to make sure any inode that prune_icache + * Cycle through iprune_sem to make sure any inode that prune_icache_sb * moved off the list before we took the lock has been fully torn * down. */ @@ -612,9 +612,10 @@ static int can_unuse(struct inode *inode) } /* - * Scan `goal' inodes on the unused list for freeable ones. They are moved to a - * temporary list and then are freed outside sb->s_inode_lru_lock by - * dispose_list(). + * Walk the superblock inode LRU for freeable inodes and attempt to free them. + * This is called from the superblock shrinker function with a number of inodes + * to trim from the LRU. Inodes to be freed are moved to a temporary list and + * then are freed outside inode_lock by dispose_list(). * * Any inodes which are pinned purely because of attached pagecache have their * pagecache removed. If the inode has metadata buffers attached to @@ -628,14 +629,15 @@ static int can_unuse(struct inode *inode) * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. */ -static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan) +void prune_icache_sb(struct super_block *sb, int nr_to_scan) { LIST_HEAD(freeable); int nr_scanned; unsigned long reap = 0; + down_read(&iprune_sem); spin_lock(&sb->s_inode_lru_lock); - for (nr_scanned = *nr_to_scan; nr_scanned >= 0; nr_scanned--) { + for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) { struct inode *inode; if (list_empty(&sb->s_inode_lru)) @@ -707,111 +709,11 @@ static void shrink_icache_sb(struct super_block *sb, int *nr_to_scan) else __count_vm_events(PGINODESTEAL, reap); spin_unlock(&sb->s_inode_lru_lock); - *nr_to_scan = nr_scanned; dispose_list(&freeable); -} - -static void prune_icache(int count) -{ - struct super_block *sb, *p = NULL; - int w_count; - int unused = inodes_stat.nr_unused; - int prune_ratio; - int pruned; - - if (unused == 0 || count == 0) - return; - down_read(&iprune_sem); - if (count >= unused) - prune_ratio = 1; - else - prune_ratio = unused / count; - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) - continue; - if (sb->s_nr_inodes_unused == 0) - continue; - sb->s_count++; - /* Now, we reclaim unused dentrins with fairness. - * We reclaim them same percentage from each superblock. - * We calculate number of dentries to scan on this sb - * as follows, but the implementation is arranged to avoid - * overflows: - * number of dentries to scan on this sb = - * count * (number of dentries on this sb / - * number of dentries in the machine) - */ - spin_unlock(&sb_lock); - if (prune_ratio != 1) - w_count = (sb->s_nr_inodes_unused / prune_ratio) + 1; - else - w_count = sb->s_nr_inodes_unused; - pruned = w_count; - /* - * We need to be sure this filesystem isn't being unmounted, - * otherwise we could race with generic_shutdown_super(), and - * end up holding a reference to an inode while the filesystem - * is unmounted. So we try to get s_umount, and make sure - * s_root isn't NULL. - */ - if (down_read_trylock(&sb->s_umount)) { - if ((sb->s_root != NULL) && - (!list_empty(&sb->s_dentry_lru))) { - shrink_icache_sb(sb, &w_count); - pruned -= w_count; - } - up_read(&sb->s_umount); - } - spin_lock(&sb_lock); - if (p) - __put_super(p); - count -= pruned; - p = sb; - /* more work left to do? */ - if (count <= 0) - break; - } - if (p) - __put_super(p); - spin_unlock(&sb_lock); up_read(&iprune_sem); } -/* - * shrink_icache_memory() will attempt to reclaim some unused inodes. Here, - * "unused" means that no dentries are referring to the inodes: the files are - * not open and the dcache references to those inodes have already been - * reclaimed. - * - * This function is passed the number of inodes to scan, and it returns the - * total number of remaining possibly-reclaimable inodes. - */ -static int shrink_icache_memory(struct shrinker *shrink, - struct shrink_control *sc) -{ - int nr = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; - - if (nr) { - /* - * Nasty deadlock avoidance. We may hold various FS locks, - * and we don't want to recurse into the FS that called us - * in clear_inode() and friends.. - */ - if (!(gfp_mask & __GFP_FS)) - return -1; - prune_icache(nr); - } - return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure; -} - -static struct shrinker icache_shrinker = { - .shrink = shrink_icache_memory, - .seeks = DEFAULT_SEEKS, -}; - static void __wait_on_freeing_inode(struct inode *inode); /* * Called with the inode lock held. @@ -1691,7 +1593,6 @@ void __init inode_init(void) (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| SLAB_MEM_SPREAD), init_once); - register_shrinker(&icache_shrinker); /* Hash may have been set up in inode_init_early */ if (!hashdist) diff --git a/fs/super.c b/fs/super.c index e63c754447ce05431655b0026195c3ec33e00c3a..37a75410079ea92116f9053b1c8dcaba88b441e8 100644 --- a/fs/super.c +++ b/fs/super.c @@ -38,6 +38,48 @@ LIST_HEAD(super_blocks); DEFINE_SPINLOCK(sb_lock); +/* + * One thing we have to be careful of with a per-sb shrinker is that we don't + * drop the last active reference to the superblock from within the shrinker. + * If that happens we could trigger unregistering the shrinker from within the + * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we + * take a passive reference to the superblock to avoid this from occurring. + */ +static int prune_super(struct shrinker *shrink, struct shrink_control *sc) +{ + struct super_block *sb; + int count; + + sb = container_of(shrink, struct super_block, s_shrink); + + /* + * Deadlock avoidance. We may hold various FS locks, and we don't want + * to recurse into the FS that called us in clear_inode() and friends.. + */ + if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS)) + return -1; + + if (!grab_super_passive(sb)) + return -1; + + if (sc->nr_to_scan) { + /* proportion the scan between the two caches */ + int total; + + total = sb->s_nr_dentry_unused + sb->s_nr_inodes_unused + 1; + count = (sc->nr_to_scan * sb->s_nr_dentry_unused) / total; + + /* prune dcache first as icache is pinned by it */ + prune_dcache_sb(sb, count); + prune_icache_sb(sb, sc->nr_to_scan - count); + } + + count = ((sb->s_nr_dentry_unused + sb->s_nr_inodes_unused) / 100) + * sysctl_vfs_cache_pressure; + drop_super(sb); + return count; +} + /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to @@ -116,6 +158,9 @@ static struct super_block *alloc_super(struct file_system_type *type) s->s_op = &default_op; s->s_time_gran = 1000000000; s->cleancache_poolid = -1; + + s->s_shrink.seeks = DEFAULT_SEEKS; + s->s_shrink.shrink = prune_super; } out: return s; @@ -183,6 +228,10 @@ void deactivate_locked_super(struct super_block *s) if (atomic_dec_and_test(&s->s_active)) { cleancache_flush_fs(s); fs->kill_sb(s); + + /* caches are now gone, we can safely kill the shrinker now */ + unregister_shrinker(&s->s_shrink); + /* * We need to call rcu_barrier so all the delayed rcu free * inodes are flushed before we release the fs module. @@ -311,7 +360,6 @@ void generic_shutdown_super(struct super_block *sb) { const struct super_operations *sop = sb->s_op; - if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); @@ -399,6 +447,7 @@ struct super_block *sget(struct file_system_type *type, list_add(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); + register_shrinker(&s->s_shrink); return s; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 460d2cc21ec60031c346b687c197114dfb8d8221..d7f35e90b84a0c2bd244e4c28ad7468929d6e2fe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -393,6 +393,7 @@ struct inodes_stat_t { #include #include #include +#include #include #include @@ -1444,8 +1445,14 @@ struct super_block { * Saved pool identifier for cleancache (-1 means none) */ int cleancache_poolid; + + struct shrinker s_shrink; /* per-sb shrinker handle */ }; +/* superblock cache pruning functions */ +extern void prune_icache_sb(struct super_block *sb, int nr_to_scan); +extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan); + extern struct timespec current_fs_time(struct super_block *sb); /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 9b9777ac726d0b8ae72bfd054452964e494eb79e..e3a1a9eec0b1e8f97eda164405fe2789fc7d7f62 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -15,6 +15,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1121,45 +1122,6 @@ static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) } #endif -/* - * This struct is used to pass information from page reclaim to the shrinkers. - * We consolidate the values for easier extention later. - */ -struct shrink_control { - gfp_t gfp_mask; - - /* How many slab objects shrinker() should scan and try to reclaim */ - unsigned long nr_to_scan; -}; - -/* - * A callback you can register to apply pressure to ageable caches. - * - * 'sc' is passed shrink_control which includes a count 'nr_to_scan' - * and a 'gfpmask'. It should look through the least-recently-used - * 'nr_to_scan' entries and attempt to free them up. It should return - * the number of objects which remain in the cache. If it returns -1, it means - * it cannot do any scanning at this time (eg. there is a risk of deadlock). - * - * The 'gfpmask' refers to the allocation we are currently trying to - * fulfil. - * - * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is - * querying the cache size, so a fastpath for that case is appropriate. - */ -struct shrinker { - int (*shrink)(struct shrinker *, struct shrink_control *sc); - int seeks; /* seeks to recreate an obj */ - long batch; /* reclaim batch size, 0 = default */ - - /* These are for internal use */ - struct list_head list; - long nr; /* objs pending delete */ -}; -#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ -extern void register_shrinker(struct shrinker *); -extern void unregister_shrinker(struct shrinker *); - int vma_wants_writenotify(struct vm_area_struct *vma); extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h new file mode 100644 index 0000000000000000000000000000000000000000..790651b4e5baf6ea9625cd186065019e8133784e --- /dev/null +++ b/include/linux/shrinker.h @@ -0,0 +1,42 @@ +#ifndef _LINUX_SHRINKER_H +#define _LINUX_SHRINKER_H + +/* + * This struct is used to pass information from page reclaim to the shrinkers. + * We consolidate the values for easier extention later. + */ +struct shrink_control { + gfp_t gfp_mask; + + /* How many slab objects shrinker() should scan and try to reclaim */ + unsigned long nr_to_scan; +}; + +/* + * A callback you can register to apply pressure to ageable caches. + * + * 'sc' is passed shrink_control which includes a count 'nr_to_scan' + * and a 'gfpmask'. It should look through the least-recently-used + * 'nr_to_scan' entries and attempt to free them up. It should return + * the number of objects which remain in the cache. If it returns -1, it means + * it cannot do any scanning at this time (eg. there is a risk of deadlock). + * + * The 'gfpmask' refers to the allocation we are currently trying to + * fulfil. + * + * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is + * querying the cache size, so a fastpath for that case is appropriate. + */ +struct shrinker { + int (*shrink)(struct shrinker *, struct shrink_control *sc); + int seeks; /* seeks to recreate an obj */ + long batch; /* reclaim batch size, 0 = default */ + + /* These are for internal use */ + struct list_head list; + long nr; /* objs pending delete */ +}; +#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ +extern void register_shrinker(struct shrinker *); +extern void unregister_shrinker(struct shrinker *); +#endif