btree.h 12.7 KB
Newer Older
K
Kent Overstreet 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
#ifndef _BCACHE_BTREE_H
#define _BCACHE_BTREE_H

/*
 * THE BTREE:
 *
 * At a high level, bcache's btree is relatively standard b+ tree. All keys and
 * pointers are in the leaves; interior nodes only have pointers to the child
 * nodes.
 *
 * In the interior nodes, a struct bkey always points to a child btree node, and
 * the key is the highest key in the child node - except that the highest key in
 * an interior node is always MAX_KEY. The size field refers to the size on disk
 * of the child node - this would allow us to have variable sized btree nodes
 * (handy for keeping the depth of the btree 1 by expanding just the root).
 *
 * Btree nodes are themselves log structured, but this is hidden fairly
 * thoroughly. Btree nodes on disk will in practice have extents that overlap
 * (because they were written at different times), but in memory we never have
 * overlapping extents - when we read in a btree node from disk, the first thing
 * we do is resort all the sets of keys with a mergesort, and in the same pass
 * we check for overlapping extents and adjust them appropriately.
 *
 * struct btree_op is a central interface to the btree code. It's used for
 * specifying read vs. write locking, and the embedded closure is used for
 * waiting on IO or reserve memory.
 *
 * BTREE CACHE:
 *
 * Btree nodes are cached in memory; traversing the btree might require reading
 * in btree nodes which is handled mostly transparently.
 *
 * bch_btree_node_get() looks up a btree node in the cache and reads it in from
 * disk if necessary. This function is almost never called directly though - the
 * btree() macro is used to get a btree node, call some function on it, and
 * unlock the node after the function returns.
 *
 * The root is special cased - it's taken out of the cache's lru (thus pinning
 * it in memory), so we can find the root of the btree by just dereferencing a
 * pointer instead of looking it up in the cache. This makes locking a bit
 * tricky, since the root pointer is protected by the lock in the btree node it
 * points to - the btree_root() macro handles this.
 *
 * In various places we must be able to allocate memory for multiple btree nodes
 * in order to make forward progress. To do this we use the btree cache itself
 * as a reserve; if __get_free_pages() fails, we'll find a node in the btree
 * cache we can reuse. We can't allow more than one thread to be doing this at a
 * time, so there's a lock, implemented by a pointer to the btree_op closure -
 * this allows the btree_root() macro to implicitly release this lock.
 *
 * BTREE IO:
 *
 * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles
 * this.
 *
 * For writing, we have two btree_write structs embeddded in struct btree - one
 * write in flight, and one being set up, and we toggle between them.
 *
 * Writing is done with a single function -  bch_btree_write() really serves two
 * different purposes and should be broken up into two different functions. When
 * passing now = false, it merely indicates that the node is now dirty - calling
 * it ensures that the dirty keys will be written at some point in the future.
 *
 * When passing now = true, bch_btree_write() causes a write to happen
 * "immediately" (if there was already a write in flight, it'll cause the write
 * to happen as soon as the previous write completes). It returns immediately
 * though - but it takes a refcount on the closure in struct btree_op you passed
 * to it, so a closure_sync() later can be used to wait for the write to
 * complete.
 *
 * This is handy because btree_split() and garbage collection can issue writes
 * in parallel, reducing the amount of time they have to hold write locks.
 *
 * LOCKING:
 *
 * When traversing the btree, we may need write locks starting at some level -
 * inserting a key into the btree will typically only require a write lock on
 * the leaf node.
 *
 * This is specified with the lock field in struct btree_op; lock = 0 means we
 * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get()
 * checks this field and returns the node with the appropriate lock held.
 *
 * If, after traversing the btree, the insertion code discovers it has to split
 * then it must restart from the root and take new locks - to do this it changes
 * the lock field and returns -EINTR, which causes the btree_root() macro to
 * loop.
 *
 * Handling cache misses require a different mechanism for upgrading to a write
 * lock. We do cache lookups with only a read lock held, but if we get a cache
 * miss and we wish to insert this data into the cache, we have to insert a
 * placeholder key to detect races - otherwise, we could race with a write and
 * overwrite the data that was just written to the cache with stale data from
 * the backing device.
 *
 * For this we use a sequence number that write locks and unlocks increment - to
 * insert the check key it unlocks the btree node and then takes a write lock,
 * and fails if the sequence number doesn't match.
 */

#include "bset.h"
#include "debug.h"

struct btree_write {
	atomic_t		*journal;

	/* If btree_split() frees a btree node, it writes a new pointer to that
	 * btree node indicating it was freed; it takes a refcount on
	 * c->prio_blocked because we can't write the gens until the new
	 * pointer is on disk. This allows btree_write_endio() to release the
	 * refcount that btree_split() took.
	 */
	int			prio_blocked;
};

116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
struct btree_keys_ops {
	bool			(*sort_cmp)(struct btree_iter_set,
					    struct btree_iter_set);
	struct bkey		*(*sort_fixup)(struct btree_iter *,
					       struct bkey *);
	bool			(*key_invalid)(struct btree *,
					       const struct bkey *);
	bool			(*key_bad)(struct btree *,
					   const struct bkey *);
	bool			(*key_merge)(struct btree *,
					     struct bkey *, struct bkey *);


	/*
	 * Only used for deciding whether to use START_KEY(k) or just the key
	 * itself in a couple places
	 */
	bool		is_extents;
};

K
Kent Overstreet 已提交
136
struct btree {
137
	const struct btree_keys_ops	*ops;
K
Kent Overstreet 已提交
138 139 140 141 142 143 144 145 146 147 148
	/* Hottest entries first */
	struct hlist_node	hash;

	/* Key/pointer for this btree node */
	BKEY_PADDED(key);

	/* Single bit - set when accessed, cleared by shrinker */
	unsigned long		accessed;
	unsigned long		seq;
	struct rw_semaphore	lock;
	struct cache_set	*c;
149
	struct btree		*parent;
K
Kent Overstreet 已提交
150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165

	unsigned long		flags;
	uint16_t		written;	/* would be nice to kill */
	uint8_t			level;
	uint8_t			nsets;
	uint8_t			page_order;

	/*
	 * Set of sorted keys - the real btree node - plus a binary search tree
	 *
	 * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
	 * to the memory we have allocated for this btree node. Additionally,
	 * set[0]->data points to the entire btree node as it exists on disk.
	 */
	struct bset_tree	sets[MAX_BSETS];

K
Kent Overstreet 已提交
166
	/* For outstanding btree writes, used as a lock - protects write_idx */
167 168
	struct closure		io;
	struct semaphore	io_mutex;
K
Kent Overstreet 已提交
169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203

	struct list_head	list;
	struct delayed_work	work;

	struct btree_write	writes[2];
	struct bio		*bio;
};

#define BTREE_FLAG(flag)						\
static inline bool btree_node_ ## flag(struct btree *b)			\
{	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
									\
static inline void set_btree_node_ ## flag(struct btree *b)		\
{	set_bit(BTREE_NODE_ ## flag, &b->flags); }			\

enum btree_flags {
	BTREE_NODE_io_error,
	BTREE_NODE_dirty,
	BTREE_NODE_write_idx,
};

BTREE_FLAG(io_error);
BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);

static inline struct btree_write *btree_current_write(struct btree *b)
{
	return b->writes + btree_node_write_idx(b);
}

static inline struct btree_write *btree_prev_write(struct btree *b)
{
	return b->writes + (btree_node_write_idx(b) ^ 1);
}

204
static inline struct bset_tree *bset_tree_last(struct btree *b)
K
Kent Overstreet 已提交
205
{
206
	return b->sets + b->nsets;
K
Kent Overstreet 已提交
207 208
}

K
Kent Overstreet 已提交
209 210 211 212 213
static inline struct bset *btree_bset_first(struct btree *b)
{
	return b->sets->data;
}

214 215 216 217 218
static inline struct bset *btree_bset_last(struct btree *b)
{
	return bset_tree_last(b)->data;
}

K
Kent Overstreet 已提交
219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
static inline unsigned bset_byte_offset(struct btree *b, struct bset *i)
{
	return ((size_t) i) - ((size_t) b->sets->data);
}

static inline unsigned bset_sector_offset(struct btree *b, struct bset *i)
{
	return (((void *) i) - ((void *) btree_bset_first(b))) >> 9;
}

static inline unsigned bset_block_offset(struct btree *b, struct bset *i)
{
	return bset_sector_offset(b, i) >> b->c->block_bits;
}

K
Kent Overstreet 已提交
234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
static inline struct bset *write_block(struct btree *b)
{
	return ((void *) b->sets[0].data) + b->written * block_bytes(b->c);
}

static inline bool bset_written(struct btree *b, struct bset_tree *t)
{
	return t->data < write_block(b);
}

static inline bool bkey_written(struct btree *b, struct bkey *k)
{
	return k < write_block(b)->start;
}

static inline void set_gc_sectors(struct cache_set *c)
{
K
Kent Overstreet 已提交
251
	atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
K
Kent Overstreet 已提交
252 253
}

254 255
static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
{
256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272
	return b->ops->key_invalid(b, k);
}

static inline bool bch_ptr_bad(struct btree *b, const struct bkey *k)
{
	return b->ops->key_bad(b, k);
}

/*
 * Tries to merge l and r: l should be lower than r
 * Returns true if we were able to merge. If we did merge, l will be the merged
 * key, r will be untouched.
 */
static inline bool bch_bkey_try_merge(struct btree *b,
				      struct bkey *l, struct bkey *r)
{
	return b->ops->key_merge ?  b->ops->key_merge(b, l, r) : false;
273 274
}

275
void bkey_put(struct cache_set *c, struct bkey *k);
276

K
Kent Overstreet 已提交
277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295
/* Looping macros */

#define for_each_cached_btree(b, c, iter)				\
	for (iter = 0;							\
	     iter < ARRAY_SIZE((c)->bucket_hash);			\
	     iter++)							\
		hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)

#define for_each_key_filter(b, k, iter, filter)				\
	for (bch_btree_iter_init((b), (iter), NULL);			\
	     ((k) = bch_btree_iter_next_filter((iter), b, filter));)

#define for_each_key(b, k, iter)					\
	for (bch_btree_iter_init((b), (iter), NULL);			\
	     ((k) = bch_btree_iter_next(iter));)

/* Recursing down the btree */

struct btree_op {
296 297 298
	/* for waiting on btree reserve in btree_split() */
	wait_queue_t		wait;

K
Kent Overstreet 已提交
299 300 301 302 303 304
	/* Btree level at which we start taking write locks */
	short			lock;

	unsigned		insert_collision:1;
};

K
Kent Overstreet 已提交
305 306 307
static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
{
	memset(op, 0, sizeof(struct btree_op));
308
	init_wait(&op->wait);
K
Kent Overstreet 已提交
309 310
	op->lock = write_lock_level;
}
K
Kent Overstreet 已提交
311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326

static inline void rw_lock(bool w, struct btree *b, int level)
{
	w ? down_write_nested(&b->lock, level + 1)
	  : down_read_nested(&b->lock, level + 1);
	if (w)
		b->seq++;
}

static inline void rw_unlock(bool w, struct btree *b)
{
	if (w)
		b->seq++;
	(w ? up_write : up_read)(&b->lock);
}

327
void bch_btree_node_read_done(struct btree *);
K
Kent Overstreet 已提交
328
void bch_btree_node_write(struct btree *, struct closure *);
K
Kent Overstreet 已提交
329 330

void bch_btree_set_root(struct btree *);
331
struct btree *bch_btree_node_alloc(struct cache_set *, int, bool);
332
struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool);
K
Kent Overstreet 已提交
333

334 335
int bch_btree_insert_check_key(struct btree *, struct btree_op *,
			       struct bkey *);
336 337
int bch_btree_insert(struct cache_set *, struct keylist *,
		     atomic_t *, struct bkey *);
K
Kent Overstreet 已提交
338

K
Kent Overstreet 已提交
339
int bch_gc_thread_start(struct cache_set *);
K
Kent Overstreet 已提交
340
size_t bch_btree_gc_finish(struct cache_set *);
K
Kent Overstreet 已提交
341
void bch_moving_gc(struct cache_set *);
K
Kent Overstreet 已提交
342
int bch_btree_check(struct cache_set *);
K
Kent Overstreet 已提交
343 344
uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);

K
Kent Overstreet 已提交
345 346 347 348 349 350
static inline void wake_up_gc(struct cache_set *c)
{
	if (c->gc_thread)
		wake_up_process(c->gc_thread);
}

351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383
#define MAP_DONE	0
#define MAP_CONTINUE	1

#define MAP_ALL_NODES	0
#define MAP_LEAF_NODES	1

#define MAP_END_KEY	1

typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *);
int __bch_btree_map_nodes(struct btree_op *, struct cache_set *,
			  struct bkey *, btree_map_nodes_fn *, int);

static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
				      struct bkey *from, btree_map_nodes_fn *fn)
{
	return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES);
}

static inline int bch_btree_map_leaf_nodes(struct btree_op *op,
					   struct cache_set *c,
					   struct bkey *from,
					   btree_map_nodes_fn *fn)
{
	return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES);
}

typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *,
				struct bkey *);
int bch_btree_map_keys(struct btree_op *, struct cache_set *,
		       struct bkey *, btree_map_keys_fn *, int);

typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);

K
Kent Overstreet 已提交
384
void bch_keybuf_init(struct keybuf *);
385 386
void bch_refill_keybuf(struct cache_set *, struct keybuf *,
		       struct bkey *, keybuf_pred_fn *);
K
Kent Overstreet 已提交
387 388 389 390
bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
				  struct bkey *);
void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
struct keybuf_key *bch_keybuf_next(struct keybuf *);
K
Kent Overstreet 已提交
391 392
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
					  struct bkey *, keybuf_pred_fn *);
K
Kent Overstreet 已提交
393 394

#endif