diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 10ce0c825fcebd5908349cbd65f810bd43fcb995..c1c44191afb1a633fb42be7f23efaa083b92570e 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -735,8 +735,8 @@ struct cache_set { * basically a lock for this that we can wait on asynchronously. The * btree_root() macro releases the lock when it returns. */ - struct closure *try_harder; - struct closure_waitlist try_wait; + struct task_struct *try_harder; + wait_queue_head_t try_wait; uint64_t try_harder_start; /* diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 731cd8e3fe902773d70729017f774d6efc7f74d1..4d50f1e7006ef011e12587366f54a1a448afb2ae 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -437,7 +437,7 @@ static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) set_btree_node_dirty(b); - if (op && op->journal) { + if (op->journal) { if (w->journal && journal_pin_cmp(b->c, w, op)) { atomic_dec_bug(w->journal); @@ -574,34 +574,35 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, return b; } -static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) +static int mca_reap(struct btree *b, unsigned min_order, bool flush) { + struct closure cl; + + closure_init_stack(&cl); lockdep_assert_held(&b->c->bucket_lock); if (!down_write_trylock(&b->lock)) return -ENOMEM; - if (b->page_order < min_order) { + BUG_ON(btree_node_dirty(b) && !b->sets[0].data); + + if (b->page_order < min_order || + (!flush && + (btree_node_dirty(b) || + atomic_read(&b->io.cl.remaining) != -1))) { rw_unlock(true, b); return -ENOMEM; } - BUG_ON(btree_node_dirty(b) && !b->sets[0].data); - - if (cl && btree_node_dirty(b)) - bch_btree_node_write(b, NULL); - - if (cl) - closure_wait_event_async(&b->io.wait, cl, - atomic_read(&b->io.cl.remaining) == -1); - - if (btree_node_dirty(b) || - !closure_is_unlocked(&b->io.cl) || - work_pending(&b->work.work)) { - rw_unlock(true, b); - return -EAGAIN; + if (btree_node_dirty(b)) { + bch_btree_node_write(b, &cl); + closure_sync(&cl); } + /* wait for any in flight btree write */ + closure_wait_event_sync(&b->io.wait, &cl, + atomic_read(&b->io.cl.remaining) == -1); + return 0; } @@ -641,7 +642,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, break; if (++i > 3 && - !mca_reap(b, NULL, 0)) { + !mca_reap(b, 0, false)) { mca_data_free(b); rw_unlock(true, b); freed++; @@ -660,7 +661,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, list_rotate_left(&c->btree_cache); if (!b->accessed && - !mca_reap(b, NULL, 0)) { + !mca_reap(b, 0, false)) { mca_bucket_free(b); mca_data_free(b); rw_unlock(true, b); @@ -783,52 +784,27 @@ static struct btree *mca_find(struct cache_set *c, struct bkey *k) return b; } -static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, - int level, struct closure *cl) +static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) { - int ret = -ENOMEM; - struct btree *i; + struct btree *b; trace_bcache_btree_cache_cannibalize(c); - if (!cl) - return ERR_PTR(-ENOMEM); - - /* - * Trying to free up some memory - i.e. reuse some btree nodes - may - * require initiating IO to flush the dirty part of the node. If we're - * running under generic_make_request(), that IO will never finish and - * we would deadlock. Returning -EAGAIN causes the cache lookup code to - * punt to workqueue and retry. - */ - if (current->bio_list) - return ERR_PTR(-EAGAIN); - - if (c->try_harder && c->try_harder != cl) { - closure_wait_event_async(&c->try_wait, cl, !c->try_harder); - return ERR_PTR(-EAGAIN); - } + if (!c->try_harder) { + c->try_harder = current; + c->try_harder_start = local_clock(); + } else if (c->try_harder != current) + return ERR_PTR(-ENOSPC); - c->try_harder = cl; - c->try_harder_start = local_clock(); -retry: - list_for_each_entry_reverse(i, &c->btree_cache, list) { - int r = mca_reap(i, cl, btree_order(k)); - if (!r) - return i; - if (r != -ENOMEM) - ret = r; - } + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), false)) + return b; - if (ret == -EAGAIN && - closure_blocking(cl)) { - mutex_unlock(&c->bucket_lock); - closure_sync(cl); - mutex_lock(&c->bucket_lock); - goto retry; - } + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), true)) + return b; - return ERR_PTR(ret); + return ERR_PTR(-ENOMEM); } /* @@ -839,18 +815,19 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, */ void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) { - if (c->try_harder == cl) { + if (c->try_harder == current) { bch_time_stats_update(&c->try_harder_time, c->try_harder_start); c->try_harder = NULL; - __closure_wake_up(&c->try_wait); + wake_up(&c->try_wait); } } -static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, - int level, struct closure *cl) +static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) { struct btree *b; + BUG_ON(current->bio_list); + lockdep_assert_held(&c->bucket_lock); if (mca_find(c, k)) @@ -860,14 +837,14 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, * the list. Check if there's any freed nodes there: */ list_for_each_entry(b, &c->btree_cache_freeable, list) - if (!mca_reap(b, NULL, btree_order(k))) + if (!mca_reap(b, btree_order(k), false)) goto out; /* We never free struct btree itself, just the memory that holds the on * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, &c->btree_cache_freed, list) - if (!mca_reap(b, NULL, 0)) { + if (!mca_reap(b, 0, false)) { mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); if (!b->sets[0].data) goto err; @@ -901,7 +878,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, if (b) rw_unlock(true, b); - b = mca_cannibalize(c, k, level, cl); + b = mca_cannibalize(c, k); if (!IS_ERR(b)) goto out; @@ -919,10 +896,9 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, * level and op->lock. */ struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, - int level, struct btree_op *op) + int level, bool write) { int i = 0; - bool write = level <= op->lock; struct btree *b; BUG_ON(level < 0); @@ -934,7 +910,7 @@ struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, return ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level, &op->cl); + b = mca_alloc(c, k, level); mutex_unlock(&c->bucket_lock); if (!b) @@ -980,7 +956,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) struct btree *b; mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level, NULL); + b = mca_alloc(c, k, level); mutex_unlock(&c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { @@ -991,17 +967,12 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) /* Btree alloc */ -static void btree_node_free(struct btree *b, struct btree_op *op) +static void btree_node_free(struct btree *b) { unsigned i; trace_bcache_btree_node_free(b); - /* - * The BUG_ON() in btree_node_get() implies that we must have a write - * lock on parent to free or even invalidate a node - */ - BUG_ON(op->lock <= b->level); BUG_ON(b == b->c->root); if (btree_node_dirty(b)) @@ -1037,7 +1008,7 @@ struct btree *bch_btree_node_alloc(struct cache_set *c, int level, SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); - b = mca_alloc(c, &k.key, level, cl); + b = mca_alloc(c, &k.key, level); if (IS_ERR(b)) goto err_free; @@ -1173,8 +1144,7 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, return stale; } -static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, - struct btree_op *op) +static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k) { /* * We block priorities from being written for the duration of garbage @@ -1191,7 +1161,7 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, memcpy(k->ptr, b->key.ptr, sizeof(uint64_t) * KEY_PTRS(&b->key)); - btree_node_free(n, op); + btree_node_free(n); up_write(&n->lock); } @@ -1211,8 +1181,8 @@ struct gc_merge_info { unsigned keys; }; -static void btree_gc_coalesce(struct btree *b, struct btree_op *op, - struct gc_stat *gc, struct gc_merge_info *r) +static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc, + struct gc_merge_info *r) { unsigned nodes = 0, keys = 0, blocks; int i; @@ -1228,7 +1198,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, for (i = nodes - 1; i >= 0; --i) { if (r[i].b->written) - r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); + r[i].b = btree_gc_alloc(r[i].b, r[i].k); if (r[i].b->written) return; @@ -1292,7 +1262,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, r[i - 1].keys = n2->keys; } - btree_node_free(r->b, op); + btree_node_free(r->b); up_write(&r->b->lock); trace_bcache_btree_gc_coalesce(nodes); @@ -1324,7 +1294,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, memset(r, 0, sizeof(r)); while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { - r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); + r->b = bch_btree_node_get(b->c, r->k, b->level - 1, true); if (IS_ERR(r->b)) { ret = PTR_ERR(r->b); @@ -1337,7 +1307,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, if (!b->written && (r->b->level || stale > 10 || b->c->gc_always_rewrite)) - r->b = btree_gc_alloc(r->b, r->k, op); + r->b = btree_gc_alloc(r->b, r->k); if (r->b->level) ret = btree_gc_recurse(r->b, op, writes, gc); @@ -1350,7 +1320,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, bkey_copy_key(&b->c->gc_done, r->k); if (!b->written) - btree_gc_coalesce(b, op, gc, r); + btree_gc_coalesce(b, gc, r); if (r[GC_MERGE_NODES - 1].b) write(r[GC_MERGE_NODES - 1].b); @@ -1404,7 +1374,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, if (!IS_ERR_OR_NULL(n)) { closure_sync(&op->cl); bch_btree_set_root(b); - btree_node_free(n, op); + btree_node_free(n); rw_unlock(true, b); } @@ -2004,18 +1974,18 @@ static int btree_split(struct btree *b, struct btree_op *op, } rw_unlock(true, n1); - btree_node_free(b, op); + btree_node_free(b); bch_time_stats_update(&b->c->btree_split_time, start_time); return 0; err_free2: __bkey_put(n2->c, &n2->key); - btree_node_free(n2, op); + btree_node_free(n2); rw_unlock(true, n2); err_free1: __bkey_put(n1->c, &n1->key); - btree_node_free(n1, op); + btree_node_free(n1); rw_unlock(true, n1); err: if (n3 == ERR_PTR(-EAGAIN) || diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 17b7a4e39c7e07df5b9966a663b3e9834619191c..72794ab8e8e5038a40faa2ade443a64915269699 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -326,7 +326,7 @@ static inline void rw_unlock(bool w, struct btree *b) ({ \ int _r, l = (b)->level - 1; \ bool _w = l <= (op)->lock; \ - struct btree *_child = bch_btree_node_get((b)->c, key, l, op); \ + struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ if (!IS_ERR(_child)) { \ _child->parent = (b); \ _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ @@ -356,6 +356,11 @@ static inline void rw_unlock(bool w, struct btree *b) } \ rw_unlock(_w, _b); \ bch_cannibalize_unlock(c, &(op)->cl); \ + if (_r == -ENOSPC) { \ + wait_event((c)->try_wait, \ + !(c)->try_harder); \ + _r = -EINTR; \ + } \ } while (_r == -EINTR); \ \ _r; \ @@ -375,8 +380,7 @@ void bch_btree_node_write(struct btree *, struct closure *); void bch_cannibalize_unlock(struct cache_set *, struct closure *); void bch_btree_set_root(struct btree *); struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); -struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, - int, struct btree_op *); +struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index d3169c0652f8cd67e1ae1a478eb687c157ed8ae3..9a164cd4058c94925e9955e863eaef66e791f1e4 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1436,12 +1436,14 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->sort_crit_factor = int_sqrt(c->btree_pages); - mutex_init(&c->bucket_lock); - mutex_init(&c->sort_lock); - spin_lock_init(&c->sort_time_lock); closure_init_unlocked(&c->sb_write); + mutex_init(&c->bucket_lock); + init_waitqueue_head(&c->try_wait); closure_init_unlocked(&c->uuid_write); + spin_lock_init(&c->sort_time_lock); + mutex_init(&c->sort_lock); spin_lock_init(&c->btree_read_time_lock); + bch_moving_init_cache_set(c); INIT_LIST_HEAD(&c->list); @@ -1529,7 +1531,7 @@ static void run_cache_set(struct cache_set *c) goto err; err = "error reading btree root"; - c->root = bch_btree_node_get(c, k, j->btree_level, &op); + c->root = bch_btree_node_get(c, k, j->btree_level, true); if (IS_ERR_OR_NULL(c->root)) goto err;