Merge tag 'dm-3.7-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm

Pull device-mapper changes from Alasdair G Kergon: "Remove the power-of-2 block size constraint on discards in dm thin provisioning and factor the bio_prison code out into a separate module (for sharing with the forthcoming cache target). Use struct bio's front_pad to eliminate the use of one separate mempool by bio-based devices. A few other tiny clean-ups." * tag 'dm-3.7-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm: dm: store dm_target_io in bio front_pad dm thin: move bio_prison code to separate module dm thin: prepare to separate bio_prison code dm thin: support discard with non power of two block size dm persistent data: convert to use le32_add_cpu dm: use ACCESS_ONCE for sysfs values dm bufio: use list_move dm mpath: fix check for null mpio in end_io fn

Merge tag 'dm-3.7-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm
Pull device-mapper changes from Alasdair G Kergon: "Remove the power-of-2 block size constraint on discards in dm thin provisioning and factor the bio_prison code out into a separate module (for sharing with the forthcoming cache target). Use struct bio's front_pad to eliminate the use of one separate mempool by bio-based devices. A few other tiny clean-ups." * tag 'dm-3.7-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm: dm: store dm_target_io in bio front_pad dm thin: move bio_prison code to separate module dm thin: prepare to separate bio_prison code dm thin: support discard with non power of two block size dm persistent data: convert to use le32_add_cpu dm: use ACCESS_ONCE for sysfs values dm bufio: use list_move dm mpath: fix check for null mpio in end_io fn
79c63eeb · Linus Torvalds · 6a5a3d6a · dba14160 · 79c63eeb · 79c63eeb
10 changed file
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -216,6 +216,13 @@ config DM_BUFIO
 	 as a cache, holding recently-read blocks in memory and performing
 	 delayed writes.

+config DM_BIO_PRISON
+       tristate
+       depends on BLK_DEV_DM && EXPERIMENTAL
+       ---help---
+	 Some bio locking schemes used by other device-mapper targets
+	 including thin provisioning.
+
 source "drivers/md/persistent-data/Kconfig"

 config DM_CRYPT
@@ -247,6 +254,7 @@ config DM_THIN_PROVISIONING
       tristate "Thin provisioning target (EXPERIMENTAL)"
       depends on BLK_DEV_DM && EXPERIMENTAL
       select DM_PERSISTENT_DATA
+       select DM_BIO_PRISON
       ---help---
         Provides thin provisioning and snapshots that share a data store.


--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_MD_FAULTY)		+= faulty.o
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
 obj-$(CONFIG_DM_BUFIO)		+= dm-bufio.o
+obj-$(CONFIG_DM_BIO_PRISON)	+= dm-bio-prison.o
 obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
 obj-$(CONFIG_DM_DELAY)		+= dm-delay.o
 obj-$(CONFIG_DM_FLAKEY)		+= dm-flakey.o

--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include "dm-bio-prison.h"
+
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+/*----------------------------------------------------------------*/
+
+struct dm_bio_prison_cell {
+	struct hlist_node list;
+	struct dm_bio_prison *prison;
+	struct dm_cell_key key;
+	struct bio *holder;
+	struct bio_list bios;
+};
+
+struct dm_bio_prison {
+	spinlock_t lock;
+	mempool_t *cell_pool;
+
+	unsigned nr_buckets;
+	unsigned hash_mask;
+	struct hlist_head *cells;
+};
+
+/*----------------------------------------------------------------*/
+
+static uint32_t calc_nr_buckets(unsigned nr_cells)
+{
+	uint32_t n = 128;
+
+	nr_cells /= 4;
+	nr_cells = min(nr_cells, 8192u);
+
+	while (n < nr_cells)
+		n <<= 1;
+
+	return n;
+}
+
+static struct kmem_cache *_cell_cache;
+
+/*
+ * @nr_cells should be the number of cells you want in use _concurrently_.
+ * Don't confuse it with the number of distinct keys.
+ */
+struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells)
+{
+	unsigned i;
+	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
+	size_t len = sizeof(struct dm_bio_prison) +
+		(sizeof(struct hlist_head) * nr_buckets);
+	struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL);
+
+	if (!prison)
+		return NULL;
+
+	spin_lock_init(&prison->lock);
+	prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
+	if (!prison->cell_pool) {
+		kfree(prison);
+		return NULL;
+	}
+
+	prison->nr_buckets = nr_buckets;
+	prison->hash_mask = nr_buckets - 1;
+	prison->cells = (struct hlist_head *) (prison + 1);
+	for (i = 0; i < nr_buckets; i++)
+		INIT_HLIST_HEAD(prison->cells + i);
+
+	return prison;
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_create);
+
+void dm_bio_prison_destroy(struct dm_bio_prison *prison)
+{
+	mempool_destroy(prison->cell_pool);
+	kfree(prison);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_destroy);
+
+static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key)
+{
+	const unsigned long BIG_PRIME = 4294967291UL;
+	uint64_t hash = key->block * BIG_PRIME;
+
+	return (uint32_t) (hash & prison->hash_mask);
+}
+
+static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs)
+{
+	       return (lhs->virtual == rhs->virtual) &&
+		       (lhs->dev == rhs->dev) &&
+		       (lhs->block == rhs->block);
+}
+
+static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
+						  struct dm_cell_key *key)
+{
+	struct dm_bio_prison_cell *cell;
+	struct hlist_node *tmp;
+
+	hlist_for_each_entry(cell, tmp, bucket, list)
+		if (keys_equal(&cell->key, key))
+			return cell;
+
+	return NULL;
+}
+
+/*
+ * This may block if a new cell needs allocating.  You must ensure that
+ * cells will be unlocked even if the calling thread is blocked.
+ *
+ * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
+ */
+int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
+		  struct bio *inmate, struct dm_bio_prison_cell **ref)
+{
+	int r = 1;
+	unsigned long flags;
+	uint32_t hash = hash_key(prison, key);
+	struct dm_bio_prison_cell *cell, *cell2;
+
+	BUG_ON(hash > prison->nr_buckets);
+
+	spin_lock_irqsave(&prison->lock, flags);
+
+	cell = __search_bucket(prison->cells + hash, key);
+	if (cell) {
+		bio_list_add(&cell->bios, inmate);
+		goto out;
+	}
+
+	/*
+	 * Allocate a new cell
+	 */
+	spin_unlock_irqrestore(&prison->lock, flags);
+	cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
+	spin_lock_irqsave(&prison->lock, flags);
+
+	/*
+	 * We've been unlocked, so we have to double check that
+	 * nobody else has inserted this cell in the meantime.
+	 */
+	cell = __search_bucket(prison->cells + hash, key);
+	if (cell) {
+		mempool_free(cell2, prison->cell_pool);
+		bio_list_add(&cell->bios, inmate);
+		goto out;
+	}
+
+	/*
+	 * Use new cell.
+	 */
+	cell = cell2;
+
+	cell->prison = prison;
+	memcpy(&cell->key, key, sizeof(cell->key));
+	cell->holder = inmate;
+	bio_list_init(&cell->bios);
+	hlist_add_head(&cell->list, prison->cells + hash);
+
+	r = 0;
+
+out:
+	spin_unlock_irqrestore(&prison->lock, flags);
+
+	*ref = cell;
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_bio_detain);
+
+/*
+ * @inmates must have been initialised prior to this call
+ */
+static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
+{
+	struct dm_bio_prison *prison = cell->prison;
+
+	hlist_del(&cell->list);
+
+	if (inmates) {
+		bio_list_add(inmates, cell->holder);
+		bio_list_merge(inmates, &cell->bios);
+	}
+
+	mempool_free(cell, prison->cell_pool);
+}
+
+void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
+{
+	unsigned long flags;
+	struct dm_bio_prison *prison = cell->prison;
+
+	spin_lock_irqsave(&prison->lock, flags);
+	__cell_release(cell, bios);
+	spin_unlock_irqrestore(&prison->lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_cell_release);
+
+/*
+ * There are a couple of places where we put a bio into a cell briefly
+ * before taking it out again.  In these situations we know that no other
+ * bio may be in the cell.  This function releases the cell, and also does
+ * a sanity check.
+ */
+static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
+{
+	BUG_ON(cell->holder != bio);
+	BUG_ON(!bio_list_empty(&cell->bios));
+
+	__cell_release(cell, NULL);
+}
+
+void dm_cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
+{
+	unsigned long flags;
+	struct dm_bio_prison *prison = cell->prison;
+
+	spin_lock_irqsave(&prison->lock, flags);
+	__cell_release_singleton(cell, bio);
+	spin_unlock_irqrestore(&prison->lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_cell_release_singleton);
+
+/*
+ * Sometimes we don't want the holder, just the additional bios.
+ */
+static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
+{
+	struct dm_bio_prison *prison = cell->prison;
+
+	hlist_del(&cell->list);
+	bio_list_merge(inmates, &cell->bios);
+
+	mempool_free(cell, prison->cell_pool);
+}
+
+void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
+{
+	unsigned long flags;
+	struct dm_bio_prison *prison = cell->prison;
+
+	spin_lock_irqsave(&prison->lock, flags);
+	__cell_release_no_holder(cell, inmates);
+	spin_unlock_irqrestore(&prison->lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
+
+void dm_cell_error(struct dm_bio_prison_cell *cell)
+{
+	struct dm_bio_prison *prison = cell->prison;
+	struct bio_list bios;
+	struct bio *bio;
+	unsigned long flags;
+
+	bio_list_init(&bios);
+
+	spin_lock_irqsave(&prison->lock, flags);
+	__cell_release(cell, &bios);
+	spin_unlock_irqrestore(&prison->lock, flags);
+
+	while ((bio = bio_list_pop(&bios)))
+		bio_io_error(bio);
+}
+EXPORT_SYMBOL_GPL(dm_cell_error);
+
+/*----------------------------------------------------------------*/
+
+#define DEFERRED_SET_SIZE 64
+
+struct dm_deferred_entry {
+	struct dm_deferred_set *ds;
+	unsigned count;
+	struct list_head work_items;
+};
+
+struct dm_deferred_set {
+	spinlock_t lock;
+	unsigned current_entry;
+	unsigned sweeper;
+	struct dm_deferred_entry entries[DEFERRED_SET_SIZE];
+};
+
+struct dm_deferred_set *dm_deferred_set_create(void)
+{
+	int i;
+	struct dm_deferred_set *ds;
+
+	ds = kmalloc(sizeof(*ds), GFP_KERNEL);
+	if (!ds)
+		return NULL;
+
+	spin_lock_init(&ds->lock);
+	ds->current_entry = 0;
+	ds->sweeper = 0;
+	for (i = 0; i < DEFERRED_SET_SIZE; i++) {
+		ds->entries[i].ds = ds;
+		ds->entries[i].count = 0;
+		INIT_LIST_HEAD(&ds->entries[i].work_items);
+	}
+
+	return ds;
+}
+EXPORT_SYMBOL_GPL(dm_deferred_set_create);
+
+void dm_deferred_set_destroy(struct dm_deferred_set *ds)
+{
+	kfree(ds);
+}
+EXPORT_SYMBOL_GPL(dm_deferred_set_destroy);
+
+struct dm_deferred_entry *dm_deferred_entry_inc(struct dm_deferred_set *ds)
+{
+	unsigned long flags;
+	struct dm_deferred_entry *entry;
+
+	spin_lock_irqsave(&ds->lock, flags);
+	entry = ds->entries + ds->current_entry;
+	entry->count++;
+	spin_unlock_irqrestore(&ds->lock, flags);
+
+	return entry;
+}
+EXPORT_SYMBOL_GPL(dm_deferred_entry_inc);
+
+static unsigned ds_next(unsigned index)
+{
+	return (index + 1) % DEFERRED_SET_SIZE;
+}
+
+static void __sweep(struct dm_deferred_set *ds, struct list_head *head)
+{
+	while ((ds->sweeper != ds->current_entry) &&
+	       !ds->entries[ds->sweeper].count) {
+		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
+		ds->sweeper = ds_next(ds->sweeper);
+	}
+
+	if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
+		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
+}
+
+void dm_deferred_entry_dec(struct dm_deferred_entry *entry, struct list_head *head)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&entry->ds->lock, flags);
+	BUG_ON(!entry->count);
+	--entry->count;
+	__sweep(entry->ds, head);
+	spin_unlock_irqrestore(&entry->ds->lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_deferred_entry_dec);
+
+/*
+ * Returns 1 if deferred or 0 if no pending items to delay job.
+ */
+int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work)
+{
+	int r = 1;
+	unsigned long flags;
+	unsigned next_entry;
+
+	spin_lock_irqsave(&ds->lock, flags);
+	if ((ds->sweeper == ds->current_entry) &&
+	    !ds->entries[ds->current_entry].count)
+		r = 0;
+	else {
+		list_add(work, &ds->entries[ds->current_entry].work_items);
+		next_entry = ds_next(ds->current_entry);
+		if (!ds->entries[next_entry].count)
+			ds->current_entry = next_entry;
+	}
+	spin_unlock_irqrestore(&ds->lock, flags);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_deferred_set_add_work);
+
+/*----------------------------------------------------------------*/
+
+static int __init dm_bio_prison_init(void)
+{
+	_cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0);
+	if (!_cell_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void __exit dm_bio_prison_exit(void)
+{
+	kmem_cache_destroy(_cell_cache);
+	_cell_cache = NULL;
+}
+
+/*
+ * module hooks
+ */
+module_init(dm_bio_prison_init);
+module_exit(dm_bio_prison_exit);
+
+MODULE_DESCRIPTION(DM_NAME " bio prison");
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
+/*
+ * Copyright (C) 2011-2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_BIO_PRISON_H
+#define DM_BIO_PRISON_H
+
+#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */
+#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */
+
+#include <linux/list.h>
+#include <linux/bio.h>
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Sometimes we can't deal with a bio straight away.  We put them in prison
+ * where they can't cause any mischief.  Bios are put in a cell identified
+ * by a key, multiple bios can be in the same cell.  When the cell is
+ * subsequently unlocked the bios become available.
+ */
+struct dm_bio_prison;
+struct dm_bio_prison_cell;
+
+/* FIXME: this needs to be more abstract */
+struct dm_cell_key {
+	int virtual;
+	dm_thin_id dev;
+	dm_block_t block;
+};
+
+struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells);
+void dm_bio_prison_destroy(struct dm_bio_prison *prison);
+
+/*
+ * This may block if a new cell needs allocating.  You must ensure that
+ * cells will be unlocked even if the calling thread is blocked.
+ *
+ * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
+ */
+int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
+		  struct bio *inmate, struct dm_bio_prison_cell **ref);
+
+void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios);
+void dm_cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio); // FIXME: bio arg not needed
+void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates);
+void dm_cell_error(struct dm_bio_prison_cell *cell);
+
+/*----------------------------------------------------------------*/
+
+/*
+ * We use the deferred set to keep track of pending reads to shared blocks.
+ * We do this to ensure the new mapping caused by a write isn't performed
+ * until these prior reads have completed.  Otherwise the insertion of the
+ * new mapping could free the old block that the read bios are mapped to.
+ */
+
+struct dm_deferred_set;
+struct dm_deferred_entry;
+
+struct dm_deferred_set *dm_deferred_set_create(void);
+void dm_deferred_set_destroy(struct dm_deferred_set *ds);
+
+struct dm_deferred_entry *dm_deferred_entry_inc(struct dm_deferred_set *ds);
+void dm_deferred_entry_dec(struct dm_deferred_entry *entry, struct list_head *head);
+int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work);
+
+/*----------------------------------------------------------------*/
+
+#endif
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -280,9 +280,7 @@ static void __cache_size_refresh(void)
 	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
 	BUG_ON(dm_bufio_client_count < 0);

-	dm_bufio_cache_size_latch = dm_bufio_cache_size;
-
-	barrier();
+	dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);

 	/*
 	 * Use default if set to 0 and report the actual cache size used.
@@ -441,8 +439,7 @@ static void __relink_lru(struct dm_buffer *b, int dirty)
 	c->n_buffers[b->list_mode]--;
 	c->n_buffers[dirty]++;
 	b->list_mode = dirty;
-	list_del(&b->lru_list);
-	list_add(&b->lru_list, &c->lru[dirty]);
+	list_move(&b->lru_list, &c->lru[dirty]);
 }

 /*----------------------------------------------------------------
@@ -813,7 +810,7 @@ static void __get_memory_limit(struct dm_bufio_client *c,
 {
 	unsigned long buffers;

-	if (dm_bufio_cache_size != dm_bufio_cache_size_latch) {
+	if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
 		mutex_lock(&dm_bufio_clients_lock);
 		__cache_size_refresh();
 		mutex_unlock(&dm_bufio_clients_lock);
@@ -1591,11 +1588,9 @@ EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);

 static void cleanup_old_buffers(void)
 {
-	unsigned long max_age = dm_bufio_max_age;
+	unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
 	struct dm_bufio_client *c;

-	barrier();
-
 	if (max_age > ULONG_MAX / HZ)
 		max_age = ULONG_MAX / HZ;


--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1309,13 +1309,14 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
 {
 	struct multipath *m = ti->private;
 	struct dm_mpath_io *mpio = map_context->ptr;
-	struct pgpath *pgpath = mpio->pgpath;
+	struct pgpath *pgpath;
 	struct path_selector *ps;
 	int r;

 	BUG_ON(!mpio);

 	r  = do_end_io(m, clone, error, mpio);
+	pgpath = mpio->pgpath;
 	if (pgpath) {
 		ps = &pgpath->pg->ps;
 		if (ps->type->end_io)

--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -5,6 +5,7 @@
 */

 #include "dm-thin-metadata.h"
+#include "dm-bio-prison.h"
 #include "dm.h"

 #include <linux/device-mapper.h>
@@ -21,7 +22,6 @@
 * Tunable constants
 */
 #define ENDIO_HOOK_POOL_SIZE 1024
-#define DEFERRED_SET_SIZE 64
 #define MAPPING_POOL_SIZE 1024
 #define PRISON_CELLS 1024
 #define COMMIT_PERIOD HZ
@@ -58,7 +58,7 @@
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block.  Obviously
- * including all devices that share this block.  (see deferred_set code)
+ * including all devices that share this block.  (see dm_deferred_set code)
 *
 * iii) copy the data block to a newly allocate block.  This step can be
 * missed out if the io covers the block. (schedule_copy).
@@ -98,382 +98,11 @@

 /*----------------------------------------------------------------*/

-/*
- * Sometimes we can't deal with a bio straight away.  We put them in prison
- * where they can't cause any mischief.  Bios are put in a cell identified
- * by a key, multiple bios can be in the same cell.  When the cell is
- * subsequently unlocked the bios become available.
- */
-struct bio_prison;
-
-struct cell_key {
-	int virtual;
-	dm_thin_id dev;
-	dm_block_t block;
-};
-
-struct dm_bio_prison_cell {
-	struct hlist_node list;
-	struct bio_prison *prison;
-	struct cell_key key;
-	struct bio *holder;
-	struct bio_list bios;
-};
-
-struct bio_prison {
-	spinlock_t lock;
-	mempool_t *cell_pool;
-
-	unsigned nr_buckets;
-	unsigned hash_mask;
-	struct hlist_head *cells;
-};
-
-static uint32_t calc_nr_buckets(unsigned nr_cells)
-{
-	uint32_t n = 128;
-
-	nr_cells /= 4;
-	nr_cells = min(nr_cells, 8192u);
-
-	while (n < nr_cells)
-		n <<= 1;
-
-	return n;
-}
-
-static struct kmem_cache *_cell_cache;
-
-/*
- * @nr_cells should be the number of cells you want in use _concurrently_.
- * Don't confuse it with the number of distinct keys.
- */
-static struct bio_prison *prison_create(unsigned nr_cells)
-{
-	unsigned i;
-	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
-	size_t len = sizeof(struct bio_prison) +
-		(sizeof(struct hlist_head) * nr_buckets);
-	struct bio_prison *prison = kmalloc(len, GFP_KERNEL);
-
-	if (!prison)
-		return NULL;
-
-	spin_lock_init(&prison->lock);
-	prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
-	if (!prison->cell_pool) {
-		kfree(prison);
-		return NULL;
-	}
-
-	prison->nr_buckets = nr_buckets;
-	prison->hash_mask = nr_buckets - 1;
-	prison->cells = (struct hlist_head *) (prison + 1);
-	for (i = 0; i < nr_buckets; i++)
-		INIT_HLIST_HEAD(prison->cells + i);
-
-	return prison;
-}
-
-static void prison_destroy(struct bio_prison *prison)
-{
-	mempool_destroy(prison->cell_pool);
-	kfree(prison);
-}
-
-static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
-{
-	const unsigned long BIG_PRIME = 4294967291UL;
-	uint64_t hash = key->block * BIG_PRIME;
-
-	return (uint32_t) (hash & prison->hash_mask);
-}
-
-static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
-{
-	       return (lhs->virtual == rhs->virtual) &&
-		       (lhs->dev == rhs->dev) &&
-		       (lhs->block == rhs->block);
-}
-
-static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
-						  struct cell_key *key)
-{
-	struct dm_bio_prison_cell *cell;
-	struct hlist_node *tmp;
-
-	hlist_for_each_entry(cell, tmp, bucket, list)
-		if (keys_equal(&cell->key, key))
-			return cell;
-
-	return NULL;
-}
-
-/*
- * This may block if a new cell needs allocating.  You must ensure that
- * cells will be unlocked even if the calling thread is blocked.
- *
- * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
- */
-static int bio_detain(struct bio_prison *prison, struct cell_key *key,
-		      struct bio *inmate, struct dm_bio_prison_cell **ref)
-{
-	int r = 1;
-	unsigned long flags;
-	uint32_t hash = hash_key(prison, key);
-	struct dm_bio_prison_cell *cell, *cell2;
-
-	BUG_ON(hash > prison->nr_buckets);
-
-	spin_lock_irqsave(&prison->lock, flags);
-
-	cell = __search_bucket(prison->cells + hash, key);
-	if (cell) {
-		bio_list_add(&cell->bios, inmate);
-		goto out;
-	}
-
-	/*
-	 * Allocate a new cell
-	 */
-	spin_unlock_irqrestore(&prison->lock, flags);
-	cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
-	spin_lock_irqsave(&prison->lock, flags);
-
-	/*
-	 * We've been unlocked, so we have to double check that
-	 * nobody else has inserted this cell in the meantime.
-	 */
-	cell = __search_bucket(prison->cells + hash, key);
-	if (cell) {
-		mempool_free(cell2, prison->cell_pool);
-		bio_list_add(&cell->bios, inmate);
-		goto out;
-	}
-
-	/*
-	 * Use new cell.
-	 */
-	cell = cell2;
-
-	cell->prison = prison;
-	memcpy(&cell->key, key, sizeof(cell->key));
-	cell->holder = inmate;
-	bio_list_init(&cell->bios);
-	hlist_add_head(&cell->list, prison->cells + hash);
-
-	r = 0;
-
-out:
-	spin_unlock_irqrestore(&prison->lock, flags);
-
-	*ref = cell;
-
-	return r;
-}
-
-/*
- * @inmates must have been initialised prior to this call
- */
-static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
-{
-	struct bio_prison *prison = cell->prison;
-
-	hlist_del(&cell->list);
-
-	if (inmates) {
-		bio_list_add(inmates, cell->holder);
-		bio_list_merge(inmates, &cell->bios);
-	}
-
-	mempool_free(cell, prison->cell_pool);
-}
-
-static void cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
-{
-	unsigned long flags;
-	struct bio_prison *prison = cell->prison;
-
-	spin_lock_irqsave(&prison->lock, flags);
-	__cell_release(cell, bios);
-	spin_unlock_irqrestore(&prison->lock, flags);
-}
-
-/*
- * There are a couple of places where we put a bio into a cell briefly
- * before taking it out again.  In these situations we know that no other
- * bio may be in the cell.  This function releases the cell, and also does
- * a sanity check.
- */
-static void __cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
-{
-	BUG_ON(cell->holder != bio);
-	BUG_ON(!bio_list_empty(&cell->bios));
-
-	__cell_release(cell, NULL);
-}
-
-static void cell_release_singleton(struct dm_bio_prison_cell *cell, struct bio *bio)
-{
-	unsigned long flags;
-	struct bio_prison *prison = cell->prison;
-
-	spin_lock_irqsave(&prison->lock, flags);
-	__cell_release_singleton(cell, bio);
-	spin_unlock_irqrestore(&prison->lock, flags);
-}
-
-/*
- * Sometimes we don't want the holder, just the additional bios.
- */
-static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
-				     struct bio_list *inmates)
-{
-	struct bio_prison *prison = cell->prison;
-
-	hlist_del(&cell->list);
-	bio_list_merge(inmates, &cell->bios);
-
-	mempool_free(cell, prison->cell_pool);
-}
-
-static void cell_release_no_holder(struct dm_bio_prison_cell *cell,
-				   struct bio_list *inmates)
-{
-	unsigned long flags;
-	struct bio_prison *prison = cell->prison;
-
-	spin_lock_irqsave(&prison->lock, flags);
-	__cell_release_no_holder(cell, inmates);
-	spin_unlock_irqrestore(&prison->lock, flags);
-}
-
-static void cell_error(struct dm_bio_prison_cell *cell)
-{
-	struct bio_prison *prison = cell->prison;
-	struct bio_list bios;
-	struct bio *bio;
-	unsigned long flags;
-
-	bio_list_init(&bios);
-
-	spin_lock_irqsave(&prison->lock, flags);
-	__cell_release(cell, &bios);
-	spin_unlock_irqrestore(&prison->lock, flags);
-
-	while ((bio = bio_list_pop(&bios)))
-		bio_io_error(bio);
-}
-
-/*----------------------------------------------------------------*/
-
-/*
- * We use the deferred set to keep track of pending reads to shared blocks.
- * We do this to ensure the new mapping caused by a write isn't performed
- * until these prior reads have completed.  Otherwise the insertion of the
- * new mapping could free the old block that the read bios are mapped to.
- */
-
-struct deferred_set;
-struct deferred_entry {
-	struct deferred_set *ds;
-	unsigned count;
-	struct list_head work_items;
-};
-
-struct deferred_set {
-	spinlock_t lock;
-	unsigned current_entry;
-	unsigned sweeper;
-	struct deferred_entry entries[DEFERRED_SET_SIZE];
-};
-
-static void ds_init(struct deferred_set *ds)
-{
-	int i;
-
-	spin_lock_init(&ds->lock);
-	ds->current_entry = 0;
-	ds->sweeper = 0;
-	for (i = 0; i < DEFERRED_SET_SIZE; i++) {
-		ds->entries[i].ds = ds;
-		ds->entries[i].count = 0;
-		INIT_LIST_HEAD(&ds->entries[i].work_items);
-	}
-}
-
-static struct deferred_entry *ds_inc(struct deferred_set *ds)
-{
-	unsigned long flags;
-	struct deferred_entry *entry;
-
-	spin_lock_irqsave(&ds->lock, flags);
-	entry = ds->entries + ds->current_entry;
-	entry->count++;
-	spin_unlock_irqrestore(&ds->lock, flags);
-
-	return entry;
-}
-
-static unsigned ds_next(unsigned index)
-{
-	return (index + 1) % DEFERRED_SET_SIZE;
-}
-
-static void __sweep(struct deferred_set *ds, struct list_head *head)
-{
-	while ((ds->sweeper != ds->current_entry) &&
-	       !ds->entries[ds->sweeper].count) {
-		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
-		ds->sweeper = ds_next(ds->sweeper);
-	}
-
-	if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
-		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
-}
-
-static void ds_dec(struct deferred_entry *entry, struct list_head *head)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&entry->ds->lock, flags);
-	BUG_ON(!entry->count);
-	--entry->count;
-	__sweep(entry->ds, head);
-	spin_unlock_irqrestore(&entry->ds->lock, flags);
-}
-
-/*
- * Returns 1 if deferred or 0 if no pending items to delay job.
- */
-static int ds_add_work(struct deferred_set *ds, struct list_head *work)
-{
-	int r = 1;
-	unsigned long flags;
-	unsigned next_entry;
-
-	spin_lock_irqsave(&ds->lock, flags);
-	if ((ds->sweeper == ds->current_entry) &&
-	    !ds->entries[ds->current_entry].count)
-		r = 0;
-	else {
-		list_add(work, &ds->entries[ds->current_entry].work_items);
-		next_entry = ds_next(ds->current_entry);
-		if (!ds->entries[next_entry].count)
-			ds->current_entry = next_entry;
-	}
-	spin_unlock_irqrestore(&ds->lock, flags);
-
-	return r;
-}
-
-/*----------------------------------------------------------------*/
-
 /*
 * Key building.
 */
 static void build_data_key(struct dm_thin_device *td,
-			   dm_block_t b, struct cell_key *key)
+			   dm_block_t b, struct dm_cell_key *key)
 {
 	key->virtual = 0;
 	key->dev = dm_thin_dev_id(td);
@@ -481,7 +110,7 @@ static void build_data_key(struct dm_thin_device *td,
 }

 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
-			      struct cell_key *key)
+			      struct dm_cell_key *key)
 {
 	key->virtual = 1;
 	key->dev = dm_thin_dev_id(td);
@@ -534,7 +163,7 @@ struct pool {
 	unsigned low_water_triggered:1;	/* A dm event has been sent */
 	unsigned no_free_space:1;	/* A -ENOSPC warning has been issued */

-	struct bio_prison *prison;
+	struct dm_bio_prison *prison;
 	struct dm_kcopyd_client *copier;

 	struct workqueue_struct *wq;
@@ -552,8 +181,8 @@ struct pool {

 	struct bio_list retry_on_resume_list;

-	struct deferred_set shared_read_ds;
-	struct deferred_set all_io_ds;
+	struct dm_deferred_set *shared_read_ds;
+	struct dm_deferred_set *all_io_ds;

 	struct dm_thin_new_mapping *next_mapping;
 	mempool_t *mapping_pool;
@@ -660,8 +289,8 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev

 struct dm_thin_endio_hook {
 	struct thin_c *tc;
-	struct deferred_entry *shared_read_entry;
-	struct deferred_entry *all_io_entry;
+	struct dm_deferred_entry *shared_read_entry;
+	struct dm_deferred_entry *all_io_entry;
 	struct dm_thin_new_mapping *overwrite_mapping;
 };

@@ -877,7 +506,7 @@ static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell,
 	unsigned long flags;

 	spin_lock_irqsave(&pool->lock, flags);
-	cell_release(cell, &pool->deferred_bios);
+	dm_cell_release(cell, &pool->deferred_bios);
 	spin_unlock_irqrestore(&tc->pool->lock, flags);

 	wake_worker(pool);
@@ -896,7 +525,7 @@ static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell
 	bio_list_init(&bios);

 	spin_lock_irqsave(&pool->lock, flags);
-	cell_release_no_holder(cell, &pool->deferred_bios);
+	dm_cell_release_no_holder(cell, &pool->deferred_bios);
 	spin_unlock_irqrestore(&pool->lock, flags);

 	wake_worker(pool);
@@ -906,7 +535,7 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
 	if (m->bio)
 		m->bio->bi_end_io = m->saved_bi_end_io;
-	cell_error(m->cell);
+	dm_cell_error(m->cell);
 	list_del(&m->list);
 	mempool_free(m, m->tc->pool->mapping_pool);
 }
@@ -921,7 +550,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 		bio->bi_end_io = m->saved_bi_end_io;

 	if (m->err) {
-		cell_error(m->cell);
+		dm_cell_error(m->cell);
 		goto out;
 	}

@@ -933,7 +562,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
 	if (r) {
 		DMERR("dm_thin_insert_block() failed");
-		cell_error(m->cell);
+		dm_cell_error(m->cell);
 		goto out;
 	}

@@ -1067,7 +696,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 	m->err = 0;
 	m->bio = NULL;

-	if (!ds_add_work(&pool->shared_read_ds, &m->list))
+	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
 		m->quiesced = 1;

 	/*
@@ -1099,7 +728,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 		if (r < 0) {
 			mempool_free(m, pool->mapping_pool);
 			DMERR("dm_kcopyd_copy() failed");
-			cell_error(cell);
+			dm_cell_error(cell);
 		}
 	}
 }
@@ -1164,7 +793,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 		if (r < 0) {
 			mempool_free(m, pool->mapping_pool);
 			DMERR("dm_kcopyd_zero() failed");
-			cell_error(cell);
+			dm_cell_error(cell);
 		}
 	}
 }
@@ -1276,7 +905,7 @@ static void no_space(struct dm_bio_prison_cell *cell)
 	struct bio_list bios;

 	bio_list_init(&bios);
-	cell_release(cell, &bios);
+	dm_cell_release(cell, &bios);

 	while ((bio = bio_list_pop(&bios)))
 		retry_on_resume(bio);
@@ -1288,13 +917,13 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 	unsigned long flags;
 	struct pool *pool = tc->pool;
 	struct dm_bio_prison_cell *cell, *cell2;
-	struct cell_key key, key2;
+	struct dm_cell_key key, key2;
 	dm_block_t block = get_bio_block(tc, bio);
 	struct dm_thin_lookup_result lookup_result;
 	struct dm_thin_new_mapping *m;

 	build_virtual_key(tc->td, block, &key);
-	if (bio_detain(tc->pool->prison, &key, bio, &cell))
+	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
 		return;

 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
@@ -1306,8 +935,8 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 		 * on this block.
 		 */
 		build_data_key(tc->td, lookup_result.block, &key2);
-		if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
-			cell_release_singleton(cell, bio);
+		if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
+			dm_cell_release_singleton(cell, bio);
 			break;
 		}

@@ -1326,7 +955,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 			m->err = 0;
 			m->bio = bio;

-			if (!ds_add_work(&pool->all_io_ds, &m->list)) {
+			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
 				spin_lock_irqsave(&pool->lock, flags);
 				list_add(&m->list, &pool->prepared_discards);
 				spin_unlock_irqrestore(&pool->lock, flags);
@@ -1338,8 +967,8 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 			 * a block boundary.  So we submit the discard of a
 			 * partial block appropriately.
 			 */
-			cell_release_singleton(cell, bio);
-			cell_release_singleton(cell2, bio);
+			dm_cell_release_singleton(cell, bio);
+			dm_cell_release_singleton(cell2, bio);
 			if ((!lookup_result.shared) && pool->pf.discard_passdown)
 				remap_and_issue(tc, bio, lookup_result.block);
 			else
@@ -1351,20 +980,20 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
 		/*
 		 * It isn't provisioned, just forget it.
 		 */
-		cell_release_singleton(cell, bio);
+		dm_cell_release_singleton(cell, bio);
 		bio_endio(bio, 0);
 		break;

 	default:
 		DMERR("discard: find block unexpectedly returned %d", r);
-		cell_release_singleton(cell, bio);
+		dm_cell_release_singleton(cell, bio);
 		bio_io_error(bio);
 		break;
 	}
 }

 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
-			  struct cell_key *key,
+			  struct dm_cell_key *key,
 			  struct dm_thin_lookup_result *lookup_result,
 			  struct dm_bio_prison_cell *cell)
 {
@@ -1384,7 +1013,7 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,

 	default:
 		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
-		cell_error(cell);
+		dm_cell_error(cell);
 		break;
 	}
 }
@@ -1395,14 +1024,14 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
 {
 	struct dm_bio_prison_cell *cell;
 	struct pool *pool = tc->pool;
-	struct cell_key key;
+	struct dm_cell_key key;

 	/*
 	 * If cell is already occupied, then sharing is already in the process
 	 * of being broken so we have nothing further to do here.
 	 */
 	build_data_key(tc->td, lookup_result->block, &key);
-	if (bio_detain(pool->prison, &key, bio, &cell))
+	if (dm_bio_detain(pool->prison, &key, bio, &cell))
 		return;

 	if (bio_data_dir(bio) == WRITE && bio->bi_size)
@@ -1410,9 +1039,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
 	else {
 		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

-		h->shared_read_entry = ds_inc(&pool->shared_read_ds);
+		h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);

-		cell_release_singleton(cell, bio);
+		dm_cell_release_singleton(cell, bio);
 		remap_and_issue(tc, bio, lookup_result->block);
 	}
 }
@@ -1427,7 +1056,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 	 * Remap empty bios (flushes) immediately, without provisioning.
 	 */
 	if (!bio->bi_size) {
-		cell_release_singleton(cell, bio);
+		dm_cell_release_singleton(cell, bio);
 		remap_and_issue(tc, bio, 0);
 		return;
 	}
@@ -1437,7 +1066,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 	 */
 	if (bio_data_dir(bio) == READ) {
 		zero_fill_bio(bio);
-		cell_release_singleton(cell, bio);
+		dm_cell_release_singleton(cell, bio);
 		bio_endio(bio, 0);
 		return;
 	}
@@ -1458,7 +1087,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 	default:
 		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
 		set_pool_mode(tc->pool, PM_READ_ONLY);
-		cell_error(cell);
+		dm_cell_error(cell);
 		break;
 	}
 }
@@ -1468,7 +1097,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
 	int r;
 	dm_block_t block = get_bio_block(tc, bio);
 	struct dm_bio_prison_cell *cell;
-	struct cell_key key;
+	struct dm_cell_key key;
 	struct dm_thin_lookup_result lookup_result;

 	/*
@@ -1476,7 +1105,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
 	 * being provisioned so we have nothing further to do here.
 	 */
 	build_virtual_key(tc->td, block, &key);
-	if (bio_detain(tc->pool->prison, &key, bio, &cell))
+	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
 		return;

 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
@@ -1491,7 +1120,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
 		 * TODO: this will probably have to change when discard goes
 		 * back in.
 		 */
-		cell_release_singleton(cell, bio);
+		dm_cell_release_singleton(cell, bio);

 		if (lookup_result.shared)
 			process_shared_bio(tc, bio, block, &lookup_result);
@@ -1501,7 +1130,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio)

 	case -ENODATA:
 		if (bio_data_dir(bio) == READ && tc->origin_dev) {
-			cell_release_singleton(cell, bio);
+			dm_cell_release_singleton(cell, bio);
 			remap_to_origin_and_issue(tc, bio);
 		} else
 			provision_block(tc, bio, block, cell);
@@ -1509,7 +1138,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio)

 	default:
 		DMERR("dm_thin_find_block() failed, error = %d", r);
-		cell_release_singleton(cell, bio);
+		dm_cell_release_singleton(cell, bio);
 		bio_io_error(bio);
 		break;
 	}
@@ -1718,7 +1347,7 @@ static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *b

 	h->tc = tc;
 	h->shared_read_entry = NULL;
-	h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
+	h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : dm_deferred_entry_inc(pool->all_io_ds);
 	h->overwrite_mapping = NULL;

 	return h;
@@ -1928,7 +1557,7 @@ static void __pool_destroy(struct pool *pool)
 	if (dm_pool_metadata_close(pool->pmd) < 0)
 		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);

-	prison_destroy(pool->prison);
+	dm_bio_prison_destroy(pool->prison);
 	dm_kcopyd_client_destroy(pool->copier);

 	if (pool->wq)
@@ -1938,6 +1567,8 @@ static void __pool_destroy(struct pool *pool)
 		mempool_free(pool->next_mapping, pool->mapping_pool);
 	mempool_destroy(pool->mapping_pool);
 	mempool_destroy(pool->endio_hook_pool);
+	dm_deferred_set_destroy(pool->shared_read_ds);
+	dm_deferred_set_destroy(pool->all_io_ds);
 	kfree(pool);
 }

@@ -1976,7 +1607,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 		pool->sectors_per_block_shift = __ffs(block_size);
 	pool->low_water_blocks = 0;
 	pool_features_init(&pool->pf);
-	pool->prison = prison_create(PRISON_CELLS);
+	pool->prison = dm_bio_prison_create(PRISON_CELLS);
 	if (!pool->prison) {
 		*error = "Error creating pool's bio prison";
 		err_p = ERR_PTR(-ENOMEM);
@@ -2012,8 +1643,20 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 	pool->low_water_triggered = 0;
 	pool->no_free_space = 0;
 	bio_list_init(&pool->retry_on_resume_list);
-	ds_init(&pool->shared_read_ds);
-	ds_init(&pool->all_io_ds);
+
+	pool->shared_read_ds = dm_deferred_set_create();
+	if (!pool->shared_read_ds) {
+		*error = "Error creating pool's shared read deferred set";
+		err_p = ERR_PTR(-ENOMEM);
+		goto bad_shared_read_ds;
+	}
+
+	pool->all_io_ds = dm_deferred_set_create();
+	if (!pool->all_io_ds) {
+		*error = "Error creating pool's all io deferred set";
+		err_p = ERR_PTR(-ENOMEM);
+		goto bad_all_io_ds;
+	}

 	pool->next_mapping = NULL;
 	pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
@@ -2042,11 +1685,15 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 bad_endio_hook_pool:
 	mempool_destroy(pool->mapping_pool);
 bad_mapping_pool:
+	dm_deferred_set_destroy(pool->all_io_ds);
+bad_all_io_ds:
+	dm_deferred_set_destroy(pool->shared_read_ds);
+bad_shared_read_ds:
 	destroy_workqueue(pool->wq);
 bad_wq:
 	dm_kcopyd_client_destroy(pool->copier);
 bad_kcopyd_client:
-	prison_destroy(pool->prison);
+	dm_bio_prison_destroy(pool->prison);
 bad_prison:
 	kfree(pool);
 bad_pool:
@@ -2272,15 +1919,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto out_flags_changed;
 	}

-	/*
-	 * The block layer requires discard_granularity to be a power of 2.
-	 */
-	if (pf.discard_enabled && !is_power_of_2(block_size)) {
-		ti->error = "Discard support must be disabled when the block size is not a power of 2";
-		r = -EINVAL;
-		goto out_flags_changed;
-	}
-
 	pt->pool = pool;
 	pt->ti = ti;
 	pt->metadata_dev = metadata_dev;
@@ -2762,6 +2400,11 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }

+static bool block_size_is_power_of_two(struct pool *pool)
+{
+	return pool->sectors_per_block_shift >= 0;
+}
+
 static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
 {
 	struct pool *pool = pt->pool;
@@ -2775,8 +2418,15 @@ static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
 	if (pt->adjusted_pf.discard_passdown) {
 		data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
 		limits->discard_granularity = data_limits->discard_granularity;
-	} else
+	} else if (block_size_is_power_of_two(pool))
 		limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+	else
+		/*
+		 * Use largest power of 2 that is a factor of sectors_per_block
+		 * but at least DATA_DEV_BLOCK_SIZE_MIN_SECTORS.
+		 */
+		limits->discard_granularity = max(1 << (ffs(pool->sectors_per_block) - 1),
+						  DATA_DEV_BLOCK_SIZE_MIN_SECTORS) << SECTOR_SHIFT;
 }

 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
@@ -2804,7 +2454,7 @@ static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 4, 0},
+	.version = {1, 5, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -2979,7 +2629,7 @@ static int thin_endio(struct dm_target *ti,

 	if (h->shared_read_entry) {
 		INIT_LIST_HEAD(&work);
-		ds_dec(h->shared_read_entry, &work);
+		dm_deferred_entry_dec(h->shared_read_entry, &work);

 		spin_lock_irqsave(&pool->lock, flags);
 		list_for_each_entry_safe(m, tmp, &work, list) {
@@ -2992,7 +2642,7 @@ static int thin_endio(struct dm_target *ti,

 	if (h->all_io_entry) {
 		INIT_LIST_HEAD(&work);
-		ds_dec(h->all_io_entry, &work);
+		dm_deferred_entry_dec(h->all_io_entry, &work);
 		spin_lock_irqsave(&pool->lock, flags);
 		list_for_each_entry_safe(m, tmp, &work, list)
 			list_add(&m->list, &pool->prepared_discards);
@@ -3095,7 +2745,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)

 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 4, 0},
+	.version = {1, 5, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
@@ -3125,10 +2775,6 @@ static int __init dm_thin_init(void)

 	r = -ENOMEM;

-	_cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0);
-	if (!_cell_cache)
-		goto bad_cell_cache;
-
 	_new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
 	if (!_new_mapping_cache)
 		goto bad_new_mapping_cache;
@@ -3142,8 +2788,6 @@ static int __init dm_thin_init(void)
 bad_endio_hook_cache:
 	kmem_cache_destroy(_new_mapping_cache);
 bad_new_mapping_cache:
-	kmem_cache_destroy(_cell_cache);
-bad_cell_cache:
 	dm_unregister_target(&pool_target);
 bad_pool_target:
 	dm_unregister_target(&thin_target);
@@ -3156,7 +2800,6 @@ static void dm_thin_exit(void)
 	dm_unregister_target(&thin_target);
 	dm_unregister_target(&pool_target);

-	kmem_cache_destroy(_cell_cache);
 	kmem_cache_destroy(_new_mapping_cache);
 	kmem_cache_destroy(_endio_hook_cache);
 }

--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -438,7 +438,7 @@ static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io)
 		verity_hash_at_level(v, io->block, i, &hash_block_start, NULL);
 		verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL);
 		if (!i) {
-			unsigned cluster = *(volatile unsigned *)&dm_verity_prefetch_cluster;
+			unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster);

 			cluster >>= v->data_dev_block_bits;
 			if (unlikely(!cluster))

--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -71,6 +71,7 @@ struct dm_target_io {
 	struct dm_io *io;
 	struct dm_target *ti;
 	union map_info info;
+	struct bio clone;
 };

 /*
@@ -214,7 +215,6 @@ struct dm_md_mempools {

 #define MIN_IOS 256
 static struct kmem_cache *_io_cache;
-static struct kmem_cache *_tio_cache;
 static struct kmem_cache *_rq_tio_cache;

 /*
@@ -232,14 +232,9 @@ static int __init local_init(void)
 	if (!_io_cache)
 		return r;

-	/* allocate a slab for the target ios */
-	_tio_cache = KMEM_CACHE(dm_target_io, 0);
-	if (!_tio_cache)
-		goto out_free_io_cache;
-
 	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
 	if (!_rq_tio_cache)
-		goto out_free_tio_cache;
+		goto out_free_io_cache;

 	_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
 	if (!_rq_bio_info_cache)
@@ -265,8 +260,6 @@ static int __init local_init(void)
 	kmem_cache_destroy(_rq_bio_info_cache);
 out_free_rq_tio_cache:
 	kmem_cache_destroy(_rq_tio_cache);
-out_free_tio_cache:
-	kmem_cache_destroy(_tio_cache);
 out_free_io_cache:
 	kmem_cache_destroy(_io_cache);

@@ -277,7 +270,6 @@ static void local_exit(void)
 {
 	kmem_cache_destroy(_rq_bio_info_cache);
 	kmem_cache_destroy(_rq_tio_cache);
-	kmem_cache_destroy(_tio_cache);
 	kmem_cache_destroy(_io_cache);
 	unregister_blkdev(_major, _name);
 	dm_uevent_exit();
@@ -463,7 +455,7 @@ static void free_io(struct mapped_device *md, struct dm_io *io)

 static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 {
-	mempool_free(tio, md->tio_pool);
+	bio_put(&tio->clone);
 }

 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
@@ -682,7 +674,6 @@ static void clone_endio(struct bio *bio, int error)
 	}

 	free_tio(md, tio);
-	bio_put(bio);
 	dec_pending(io, error);
 }

@@ -1002,12 +993,12 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
 }
 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);

-static void __map_bio(struct dm_target *ti, struct bio *clone,
-		      struct dm_target_io *tio)
+static void __map_bio(struct dm_target *ti, struct dm_target_io *tio)
 {
 	int r;
 	sector_t sector;
 	struct mapped_device *md;
+	struct bio *clone = &tio->clone;

 	clone->bi_end_io = clone_endio;
 	clone->bi_private = tio;
@@ -1031,7 +1022,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 		/* error the io and bail out, or requeue it if needed */
 		md = tio->io->md;
 		dec_pending(tio->io, r);
-		bio_put(clone);
 		free_tio(md, tio);
 	} else if (r) {
 		DMWARN("unimplemented target map return value: %d", r);
@@ -1052,14 +1042,13 @@ struct clone_info {
 /*
 * Creates a little bio that just does part of a bvec.
 */
-static struct bio *split_bvec(struct bio *bio, sector_t sector,
-			      unsigned short idx, unsigned int offset,
-			      unsigned int len, struct bio_set *bs)
+static void split_bvec(struct dm_target_io *tio, struct bio *bio,
+		       sector_t sector, unsigned short idx, unsigned int offset,
+		       unsigned int len, struct bio_set *bs)
 {
-	struct bio *clone;
+	struct bio *clone = &tio->clone;
 	struct bio_vec *bv = bio->bi_io_vec + idx;

-	clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
 	*clone->bi_io_vec = *bv;

 	clone->bi_sector = sector;
@@ -1076,20 +1065,18 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
 		bio_integrity_trim(clone,
 				   bio_sector_offset(bio, idx, offset), len);
 	}
-
-	return clone;
 }

 /*
 * Creates a bio that consists of range of complete bvecs.
 */
-static struct bio *clone_bio(struct bio *bio, sector_t sector,
-			     unsigned short idx, unsigned short bv_count,
-			     unsigned int len, struct bio_set *bs)
+static void clone_bio(struct dm_target_io *tio, struct bio *bio,
+		      sector_t sector, unsigned short idx,
+		      unsigned short bv_count, unsigned int len,
+		      struct bio_set *bs)
 {
-	struct bio *clone;
+	struct bio *clone = &tio->clone;

-	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
 	__bio_clone(clone, bio);
 	clone->bi_sector = sector;
 	clone->bi_idx = idx;
@@ -1104,14 +1091,16 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
 			bio_integrity_trim(clone,
 					   bio_sector_offset(bio, idx, 0), len);
 	}
-
-	return clone;
 }

 static struct dm_target_io *alloc_tio(struct clone_info *ci,
-				      struct dm_target *ti)
+				      struct dm_target *ti, int nr_iovecs)
 {
-	struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
+	struct dm_target_io *tio;
+	struct bio *clone;
+
+	clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs);
+	tio = container_of(clone, struct dm_target_io, clone);

 	tio->io = ci->io;
 	tio->ti = ti;
@@ -1123,8 +1112,8 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci,
 static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
 				   unsigned request_nr, sector_t len)
 {
-	struct dm_target_io *tio = alloc_tio(ci, ti);
-	struct bio *clone;
+	struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs);
+	struct bio *clone = &tio->clone;

 	tio->info.target_request_nr = request_nr;

@@ -1133,14 +1122,14 @@ static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
 	 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
 	 * and discard, so no need for concern about wasted bvec allocations.
 	 */
-	clone = bio_clone_bioset(ci->bio, GFP_NOIO, ci->md->bs);

+	 __bio_clone(clone, ci->bio);
 	if (len) {
 		clone->bi_sector = ci->sector;
 		clone->bi_size = to_bytes(len);
 	}

-	__map_bio(ti, clone, tio);
+	__map_bio(ti, tio);
 }

 static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
@@ -1169,14 +1158,13 @@ static int __clone_and_map_empty_flush(struct clone_info *ci)
 */
 static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
 {
-	struct bio *clone, *bio = ci->bio;
+	struct bio *bio = ci->bio;
 	struct dm_target_io *tio;

-	tio = alloc_tio(ci, ti);
-	clone = clone_bio(bio, ci->sector, ci->idx,
-			  bio->bi_vcnt - ci->idx, ci->sector_count,
-			  ci->md->bs);
-	__map_bio(ti, clone, tio);
+	tio = alloc_tio(ci, ti, bio->bi_max_vecs);
+	clone_bio(tio, bio, ci->sector, ci->idx, bio->bi_vcnt - ci->idx,
+		  ci->sector_count, ci->md->bs);
+	__map_bio(ti, tio);
 	ci->sector_count = 0;
 }

@@ -1214,7 +1202,7 @@ static int __clone_and_map_discard(struct clone_info *ci)

 static int __clone_and_map(struct clone_info *ci)
 {
-	struct bio *clone, *bio = ci->bio;
+	struct bio *bio = ci->bio;
 	struct dm_target *ti;
 	sector_t len = 0, max;
 	struct dm_target_io *tio;
@@ -1254,10 +1242,10 @@ static int __clone_and_map(struct clone_info *ci)
 			len += bv_len;
 		}

-		tio = alloc_tio(ci, ti);
-		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
-				  ci->md->bs);
-		__map_bio(ti, clone, tio);
+		tio = alloc_tio(ci, ti, bio->bi_max_vecs);
+		clone_bio(tio, bio, ci->sector, ci->idx, i - ci->idx, len,
+			  ci->md->bs);
+		__map_bio(ti, tio);

 		ci->sector += len;
 		ci->sector_count -= len;
@@ -1282,12 +1270,11 @@ static int __clone_and_map(struct clone_info *ci)

 			len = min(remaining, max);

-			tio = alloc_tio(ci, ti);
-			clone = split_bvec(bio, ci->sector, ci->idx,
-					   bv->bv_offset + offset, len,
-					   ci->md->bs);
+			tio = alloc_tio(ci, ti, 1);
+			split_bvec(tio, bio, ci->sector, ci->idx,
+				   bv->bv_offset + offset, len, ci->md->bs);

-			__map_bio(ti, clone, tio);
+			__map_bio(ti, tio);

 			ci->sector += len;
 			ci->sector_count -= len;
@@ -1955,7 +1942,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 {
 	struct dm_md_mempools *p;

-	if (md->io_pool && md->tio_pool && md->bs)
+	if (md->io_pool && (md->tio_pool || dm_table_get_type(t) == DM_TYPE_BIO_BASED) && md->bs)
 		/* the md already has necessary mempools */
 		goto out;

@@ -2732,14 +2719,16 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
 	if (!pools->io_pool)
 		goto free_pools_and_out;

-	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
-			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
-			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
-	if (!pools->tio_pool)
-		goto free_io_pool_and_out;
+	pools->tio_pool = NULL;
+	if (type == DM_TYPE_REQUEST_BASED) {
+		pools->tio_pool = mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
+		if (!pools->tio_pool)
+			goto free_io_pool_and_out;
+	}

 	pools->bs = (type == DM_TYPE_BIO_BASED) ?
-		bioset_create(pool_size, 0) :
+		bioset_create(pool_size,
+			      offsetof(struct dm_target_io, clone)) :
 		bioset_create(pool_size,
 			      offsetof(struct dm_rq_clone_bio_info, clone));
 	if (!pools->bs)
@@ -2754,7 +2743,8 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
 	bioset_free(pools->bs);

 free_tio_pool_and_out:
-	mempool_destroy(pools->tio_pool);
+	if (pools->tio_pool)
+		mempool_destroy(pools->tio_pool);

 free_io_pool_and_out:
 	mempool_destroy(pools->io_pool);

--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -434,14 +434,14 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
 	if (ref_count && !old) {
 		*ev = SM_ALLOC;
 		ll->nr_allocated++;
-		ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) - 1);
+		le32_add_cpu(&ie_disk.nr_free, -1);
 		if (le32_to_cpu(ie_disk.none_free_before) == bit)
 			ie_disk.none_free_before = cpu_to_le32(bit + 1);

 	} else if (old && !ref_count) {
 		*ev = SM_FREE;
 		ll->nr_allocated--;
-		ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) + 1);
+		le32_add_cpu(&ie_disk.nr_free, 1);
 		ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit));
 	}