Commit afad97ee authored by Linus Torvalds

Merge tag 'dm-4.1-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - the most extensive changes this cycle are the DM core improvements to
   add full blk-mq support to request-based DM.

    - disabled by default but user can opt-in with CONFIG_DM_MQ_DEFAULT
    - depends on some blk-mq changes from Jens' for-4.1/core branch,
      which explains why this pull is built on linux-block.git

 - update DM to use name_to_dev_t() rather than open-coding a less
   capable device parser.

    - includes a couple of small improvements to name_to_dev_t() that
      offer stricter constraints than DM's code provided.

 - improvements to the dm-cache "mq" cache replacement policy.

 - a DM crypt crypt_ctr() error path fix and an async crypto deadlock
   fix

 - a small efficiency improvement for DM crypt decryption by leveraging
   immutable biovecs

 - add error handling modes for corrupted blocks to DM verity

 - a new "log-writes" DM target from Josef Bacik that is meant for file
   system developers to test file system integrity at particular points
   in the life of a file system

 - a few DM log userspace cleanups and fixes

 - a few Documentation fixes (for thin, cache, crypt and switch)

* tag 'dm-4.1-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (34 commits)
  dm crypt: fix missing error code return from crypt_ctr error path
  dm crypt: fix deadlock when async crypto algorithm returns -EBUSY
  dm crypt: leverage immutable biovecs when decrypting on read
  dm crypt: update URLs to new cryptsetup project page
  dm: add log writes target
  dm table: use bool function return values of true/false not 1/0
  dm verity: add error handling modes for corrupted blocks
  dm thin: remove stale 'trim' message documentation
  dm delay: use msecs_to_jiffies for time conversion
  dm log userspace base: fix compile warning
  dm log userspace transfer: match wait_for_completion_timeout return type
  dm table: fall back to getting device using name_to_dev_t()
  init: stricter checking of major:minor root= values
  init: export name_to_dev_t and mark name argument as const
  dm: add 'use_blk_mq' module param and expose in per-device ro sysfs attr
  dm: optimize dm_mq_queue_rq to _not_ use kthread if using pure blk-mq
  dm: add full blk-mq support to request-based DM
  dm: impose configurable deadline for dm_request_fn's merge heuristic
  dm sysfs: introduce ability to add writable attributes
  dm: don't start current request if it would've merged with the previous
  ...
......@@ -23,3 +23,25 @@ Description: Device-mapper device suspend state.
Contains the value 1 while the device is suspended.
Otherwise it contains 0. Read-only attribute.
Users: util-linux, device-mapper udev rules
What: /sys/block/dm-<num>/dm/rq_based_seq_io_merge_deadline
Date: March 2015
KernelVersion: 4.1
Contact: dm-devel@redhat.com
Description: Allow control over how long a request that is a
reasonable merge candidate can be queued on the request
queue. The resolution of this deadline is in
microseconds (ranging from 1 to 100000 usecs).
Setting this attribute to 0 (the default) will disable
request-based DM's merge heuristic and associated extra
accounting. This attribute is not applicable to
bio-based DM devices so it will only ever report 0 for
them.
What: /sys/block/dm-<num>/dm/use_blk_mq
Date: March 2015
KernelVersion: 4.1
Contact: dm-devel@redhat.com
Description: Request-based Device-mapper blk-mq I/O path mode.
Contains the value 1 if the device is using blk-mq.
Otherwise it contains 0. Read-only attribute.
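As a rough illustration (the dm-0 device name and the value 100 below
are placeholders, not values taken from this series), both attributes
can be queried and tuned from a shell:

  # is this device using the blk-mq I/O path?
  cat /sys/block/dm-0/dm/use_blk_mq

  # queue merge candidates for up to 100 microseconds
  # (0 disables the heuristic; valid range is 1 to 100000 usecs)
  echo 100 > /sys/block/dm-0/dm/rq_based_seq_io_merge_deadline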
......@@ -5,7 +5,7 @@ Device-Mapper's "crypt" target provides transparent encryption of block devices
using the kernel crypto API.
For a more detailed description of supported parameters see:
http://code.google.com/p/cryptsetup/wiki/DMCrypt
https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt
Parameters: <cipher> <key> <iv_offset> <device path> \
<offset> [<#opt_params> <opt_params>]
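A minimal sketch of a crypt table built from these parameters (the
device path, cipher string and hex-encoded key below are placeholders,
not values taken from this document):

  # <start> <size> crypt <cipher> <key> <iv_offset> <device path> <offset>
  dmsetup create crypt0 --table "0 $(blockdev --getsz /dev/sdb) crypt \
      aes-cbc-essiv:sha256 <hex key> 0 /dev/sdb 0"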
......@@ -80,7 +80,7 @@ Example scripts
===============
LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
encryption with dm-crypt using the 'cryptsetup' utility, see
http://code.google.com/p/cryptsetup/
https://gitlab.com/cryptsetup/cryptsetup
[[
#!/bin/sh
......
dm-log-writes
=============
This target takes 2 devices, one to pass all IO to normally, and one to log all
of the write operations to. This is intended for file system developers wishing
to verify the integrity of metadata or data as the file system is written to.
There is a log_write_entry written for every WRITE request and the target is
able to take arbitrary data from userspace to insert into the log. The data
that is in the WRITE requests is copied into the log to make the replay happen
exactly as it happened originally.
Log Ordering
============
We log things in order of completion once we are sure the write is no longer in
cache. This means that normal WRITE requests are not actually logged until the
next REQ_FLUSH request. This makes it easier for userspace to replay the
log in a way that correlates to what is on disk rather than what is in
cache, and to detect improper waiting/flushing.
This works by attaching all WRITE requests to a list once the write completes.
Once we see a REQ_FLUSH request we splice this list onto the request and once
the FLUSH request completes we log all of the WRITEs and then the FLUSH. Only
completed WRITEs, at the time the REQ_FLUSH is issued, are added in order to
simulate the worst case scenario with regard to power failures. Consider the
following example (W means write, C means complete):
W1,W2,W3,C3,C2,Wflush,C1,Cflush
The log would show the following
W3,W2,flush,W1....
Again, this is to simulate what is actually on disk; it allows us to detect
cases where a power failure at a particular point in time would create an
inconsistent file system.
Any REQ_FUA requests bypass this flushing mechanism and are logged as soon as
they complete as those requests will obviously bypass the device cache.
Any REQ_DISCARD requests are treated like WRITE requests. Otherwise we would
have all the DISCARD requests, and then the WRITE requests and then the FLUSH
request. Consider the following example:
WRITE block 1, DISCARD block 1, FLUSH
If we logged DISCARD when it completed, the replay would look like this
DISCARD 1, WRITE 1, FLUSH
which isn't quite what happened and wouldn't be caught during the log replay.
Target interface
================
i) Constructor
log-writes <dev_path> <log_dev_path>
dev_path : Device that all of the IO will go to normally.
log_dev_path : Device where the log entries are written to.
ii) Status
<#logged entries> <highest allocated sector>
#logged entries : Number of logged entries
highest allocated sector : Highest allocated sector
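As a quick sketch (the device name "log" and the numbers in the output
below are illustrative only), the status line can be read back with
dmsetup:

  dmsetup status log
  0 8388608 log-writes 5 163840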
iii) Messages
mark <description>
You can use a dmsetup message to set an arbitrary mark in a log.
For example, say you want to fsck a file system after every
write, but first you need to replay up to the mkfs to make sure
we're fsck'ing something reasonable. You would do something like
this:
mkfs.btrfs -f /dev/mapper/log
dmsetup message log 0 mark mkfs
<run test>
This would allow you to replay the log up to the mkfs mark and
then replay from that point on doing the fsck check in the
interval that you want.
Every log has a mark at the end labeled "dm-log-writes-end".
Userspace component
===================
There is a userspace tool that will replay the log for you in various ways.
It can be found here: https://github.com/josefbacik/log-writes
Example usage
=============
Say you want to test fsync on your file system. You would do something like
this:
TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc"
dmsetup create log --table "$TABLE"
mkfs.btrfs -f /dev/mapper/log
dmsetup message log 0 mark mkfs
mount /dev/mapper/log /mnt/btrfs-test
<some test that does fsync at the end>
dmsetup message log 0 mark fsync
md5sum /mnt/btrfs-test/foo
umount /mnt/btrfs-test
dmsetup remove log
replay-log --log /dev/sdc --replay /dev/sdb --end-mark fsync
mount /dev/sdb /mnt/btrfs-test
md5sum /mnt/btrfs-test/foo
<verify md5sum's are correct>
Another option is to do a complicated file system operation and verify the file
system is consistent during the entire operation. You could do this with:
TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc"
dmsetup create log --table "$TABLE"
mkfs.btrfs -f /dev/mapper/log
dmsetup message log 0 mark mkfs
mount /dev/mapper/log /mnt/btrfs-test
<fsstress to dirty the fs>
btrfs filesystem balance /mnt/btrfs-test
umount /mnt/btrfs-test
dmsetup remove log
replay-log --log /dev/sdc --replay /dev/sdb --end-mark mkfs
btrfsck /dev/sdb
replay-log --log /dev/sdc --replay /dev/sdb --start-mark mkfs \
--fsck "btrfsck /dev/sdb" --check fua
This will replay the log until it sees a FUA request, run the fsck command,
and if the fsck passes it will replay to the next FUA, continuing until the
log is completed or the fsck command exits abnormally.
......@@ -47,8 +47,8 @@ consume far too much memory.
Using this device-mapper switch target we can now build a two-layer
device hierarchy:
Upper Tier Determine which array member the I/O should be sent to.
Lower Tier Load balance amongst paths to a particular member.
Upper Tier - Determine which array member the I/O should be sent to.
Lower Tier - Load balance amongst paths to a particular member.
The lower tier consists of a single dm multipath device for each member.
Each of these multipath devices contains the set of paths directly to
......
......@@ -380,9 +380,6 @@ then you'll have no access to blocks mapped beyond the end. If you
load a target that is bigger than before, then extra blocks will be
provisioned as and when needed.
If you wish to reduce the size of your thin device and potentially
regain some space then send the 'trim' message to the pool.
ii) Status
<nr mapped sectors> <highest mapped sector>
......
......@@ -11,6 +11,7 @@ Construction Parameters
<data_block_size> <hash_block_size>
<num_data_blocks> <hash_start_block>
<algorithm> <digest> <salt>
[<#opt_params> <opt_params>]
<version>
This is the type of the on-disk hash format.
......@@ -62,6 +63,22 @@ Construction Parameters
<salt>
The hexadecimal encoding of the salt value.
<#opt_params>
Number of optional parameters. If there are no optional parameters,
the optional parameters section can be skipped or #opt_params can be zero.
Otherwise #opt_params is the number of following arguments.
Example of optional parameters section:
1 ignore_corruption
ignore_corruption
Log corrupted blocks, but allow read operations to proceed normally.
restart_on_corruption
Restart the system when a corrupted block is discovered. This option is
not compatible with ignore_corruption and requires user space support to
avoid restart loops.
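As a hedged sketch of how an optional parameter is appended to an
otherwise ordinary verity table (every bracketed value and device path
below is a placeholder, not taken from this document):

  dmsetup create vroot --readonly --table \
    "0 <data_sectors> verity 1 /dev/sda1 /dev/sda2 4096 4096 \
     <num_data_blocks> <hash_start_block> sha256 <digest> <salt> \
     1 ignore_corruption"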
Theory of operation
===================
......@@ -125,7 +142,7 @@ block boundary) are the hash blocks which are stored a depth at a time
The full specification of kernel parameters and on-disk metadata format
is available at the cryptsetup project's wiki page
http://code.google.com/p/cryptsetup/wiki/DMVerity
https://gitlab.com/cryptsetup/cryptsetup/wikis/DMVerity
Status
======
......@@ -142,7 +159,7 @@ Set up a device:
A command line tool veritysetup is available to compute or verify
the hash tree or activate the kernel device. This is available from
the cryptsetup upstream repository http://code.google.com/p/cryptsetup/
the cryptsetup upstream repository https://gitlab.com/cryptsetup/cryptsetup/
(as a libcryptsetup extension).
Create hash on the device:
......
......@@ -196,6 +196,17 @@ config BLK_DEV_DM
If unsure, say N.
config DM_MQ_DEFAULT
bool "request-based DM: use blk-mq I/O path by default"
depends on BLK_DEV_DM
---help---
This option enables the blk-mq based I/O path for request-based
DM devices by default. With this option the dm_mod.use_blk_mq
module/boot option defaults to Y; without it, to N. It can
still be overridden either way.
If unsure, say N.
config DM_DEBUG
bool "Device mapper debugging support"
depends on BLK_DEV_DM
......@@ -432,4 +443,20 @@ config DM_SWITCH
If unsure, say N.
config DM_LOG_WRITES
tristate "Log writes target support"
depends on BLK_DEV_DM
---help---
This device-mapper target takes two devices, one device to use
normally, one to log all write operations done to the first device.
This is for use by file system developers wishing to verify that
their fs is writing a consistent file system at all times by allowing
them to replay the log in a variety of ways and to check the
contents.
To compile this code as a module, choose M here: the module will
be called dm-log-writes.
If unsure, say N.
endif # MD
......@@ -55,6 +55,7 @@ obj-$(CONFIG_DM_CACHE) += dm-cache.o
obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o
obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
obj-$(CONFIG_DM_ERA) += dm-era.o
obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
......
......@@ -8,6 +8,7 @@
#include "dm.h"
#include <linux/hash.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
......@@ -124,32 +125,41 @@ static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
* sorted queue.
*/
#define NR_QUEUE_LEVELS 16u
#define NR_SENTINELS NR_QUEUE_LEVELS * 3
#define WRITEBACK_PERIOD HZ
struct queue {
unsigned nr_elts;
bool current_writeback_sentinels;
unsigned long next_writeback;
struct list_head qs[NR_QUEUE_LEVELS];
struct list_head sentinels[NR_SENTINELS];
};
static void queue_init(struct queue *q)
{
unsigned i;
for (i = 0; i < NR_QUEUE_LEVELS; i++)
q->nr_elts = 0;
q->current_writeback_sentinels = false;
q->next_writeback = 0;
for (i = 0; i < NR_QUEUE_LEVELS; i++) {
INIT_LIST_HEAD(q->qs + i);
INIT_LIST_HEAD(q->sentinels + i);
INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i);
INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i);
}
}
/*
* Checks to see if the queue is empty.
* FIXME: reduce cpu usage.
*/
static bool queue_empty(struct queue *q)
static unsigned queue_size(struct queue *q)
{
unsigned i;
for (i = 0; i < NR_QUEUE_LEVELS; i++)
if (!list_empty(q->qs + i))
return false;
return q->nr_elts;
}
return true;
static bool queue_empty(struct queue *q)
{
return q->nr_elts == 0;
}
/*
......@@ -157,24 +167,19 @@ static bool queue_empty(struct queue *q)
*/
static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
{
q->nr_elts++;
list_add_tail(elt, q->qs + level);
}
static void queue_remove(struct list_head *elt)
static void queue_remove(struct queue *q, struct list_head *elt)
{
q->nr_elts--;
list_del(elt);
}
/*
* Shifts all regions down one level. This has no effect on the order of
* the queue.
*/
static void queue_shift_down(struct queue *q)
static bool is_sentinel(struct queue *q, struct list_head *h)
{
unsigned level;
for (level = 1; level < NR_QUEUE_LEVELS; level++)
list_splice_init(q->qs + level, q->qs + level - 1);
return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS));
}
/*
......@@ -184,10 +189,12 @@ static void queue_shift_down(struct queue *q)
static struct list_head *queue_peek(struct queue *q)
{
unsigned level;
struct list_head *h;
for (level = 0; level < NR_QUEUE_LEVELS; level++)
if (!list_empty(q->qs + level))
return q->qs[level].next;
list_for_each(h, q->qs + level)
if (!is_sentinel(q, h))
return h;
return NULL;
}
......@@ -197,16 +204,34 @@ static struct list_head *queue_pop(struct queue *q)
struct list_head *r = queue_peek(q);
if (r) {
q->nr_elts--;
list_del(r);
/* have we just emptied the bottom level? */
if (list_empty(q->qs))
queue_shift_down(q);
}
return r;
}
/*
* Pops an entry from a level that is not past a sentinel.
*/
static struct list_head *queue_pop_old(struct queue *q)
{
unsigned level;
struct list_head *h;
for (level = 0; level < NR_QUEUE_LEVELS; level++)
list_for_each(h, q->qs + level) {
if (is_sentinel(q, h))
break;
q->nr_elts--;
list_del(h);
return h;
}
return NULL;
}
static struct list_head *list_pop(struct list_head *lh)
{
struct list_head *r = lh->next;
......@@ -217,6 +242,62 @@ static struct list_head *list_pop(struct list_head *lh)
return r;
}
static struct list_head *writeback_sentinel(struct queue *q, unsigned level)
{
if (q->current_writeback_sentinels)
return q->sentinels + NR_QUEUE_LEVELS + level;
else
return q->sentinels + 2 * NR_QUEUE_LEVELS + level;
}
static void queue_update_writeback_sentinels(struct queue *q)
{
unsigned i;
struct list_head *h;
if (time_after(jiffies, q->next_writeback)) {
for (i = 0; i < NR_QUEUE_LEVELS; i++) {
h = writeback_sentinel(q, i);
list_del(h);
list_add_tail(h, q->qs + i);
}
q->next_writeback = jiffies + WRITEBACK_PERIOD;
q->current_writeback_sentinels = !q->current_writeback_sentinels;
}
}
/*
* Sometimes we want to iterate through entries that have been pushed since
* a certain event. We use sentinel entries on the queues to delimit these
* 'tick' events.
*/
static void queue_tick(struct queue *q)
{
unsigned i;
for (i = 0; i < NR_QUEUE_LEVELS; i++) {
list_del(q->sentinels + i);
list_add_tail(q->sentinels + i, q->qs + i);
}
}
typedef void (*iter_fn)(struct list_head *, void *);
static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context)
{
unsigned i;
struct list_head *h;
for (i = 0; i < NR_QUEUE_LEVELS; i++) {
list_for_each_prev(h, q->qs + i) {
if (is_sentinel(q, h))
break;
fn(h, context);
}
}
}
/*----------------------------------------------------------------*/
/*
......@@ -232,8 +313,6 @@ struct entry {
*/
bool dirty:1;
unsigned hit_count;
unsigned generation;
unsigned tick;
};
/*
......@@ -481,7 +560,6 @@ static bool in_cache(struct mq_policy *mq, struct entry *e)
*/
static void push(struct mq_policy *mq, struct entry *e)
{
e->tick = mq->tick;
hash_insert(mq, e);
if (in_cache(mq, e))
......@@ -496,7 +574,11 @@ static void push(struct mq_policy *mq, struct entry *e)
*/
static void del(struct mq_policy *mq, struct entry *e)
{
queue_remove(&e->list);
if (in_cache(mq, e))
queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list);
else
queue_remove(&mq->pre_cache, &e->list);
hash_remove(e);
}
......@@ -518,18 +600,24 @@ static struct entry *pop(struct mq_policy *mq, struct queue *q)
return e;
}
static struct entry *peek(struct queue *q)
static struct entry *pop_old(struct mq_policy *mq, struct queue *q)
{
struct list_head *h = queue_peek(q);
return h ? container_of(h, struct entry, list) : NULL;
struct entry *e;
struct list_head *h = queue_pop_old(q);
if (!h)
return NULL;
e = container_of(h, struct entry, list);
hash_remove(e);
return e;
}
/*
* Has this entry already been updated?
*/
static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
static struct entry *peek(struct queue *q)
{
return mq->tick == e->tick;
struct list_head *h = queue_peek(q);
return h ? container_of(h, struct entry, list) : NULL;
}
/*
......@@ -583,20 +671,9 @@ static void check_generation(struct mq_policy *mq)
* Whenever we use an entry we bump up it's hit counter, and push it to the
* back to it's current level.
*/
static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
static void requeue(struct mq_policy *mq, struct entry *e)
{
if (updated_this_tick(mq, e))
return;
e->hit_count++;
mq->hit_count++;
check_generation(mq);
/* generation adjustment, to stop the counts increasing forever. */
/* FIXME: divide? */
/* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */
e->generation = mq->generation;
del(mq, e);
push(mq, e);
}
......@@ -703,7 +780,7 @@ static int cache_entry_found(struct mq_policy *mq,
struct entry *e,
struct policy_result *result)
{
requeue_and_update_tick(mq, e);
requeue(mq, e);
if (in_cache(mq, e)) {
result->op = POLICY_HIT;
......@@ -740,8 +817,6 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
new_e->oblock = e->oblock;
new_e->dirty = false;
new_e->hit_count = e->hit_count;
new_e->generation = e->generation;
new_e->tick = e->tick;
del(mq, e);
free_entry(&mq->pre_cache_pool, e);
......@@ -757,18 +832,16 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
int data_dir, struct policy_result *result)
{
int r = 0;
bool updated = updated_this_tick(mq, e);
if ((!discarded_oblock && updated) ||
!should_promote(mq, e, discarded_oblock, data_dir)) {
requeue_and_update_tick(mq, e);
if (!should_promote(mq, e, discarded_oblock, data_dir)) {
requeue(mq, e);
result->op = POLICY_MISS;
} else if (!can_migrate)
r = -EWOULDBLOCK;
else {
requeue_and_update_tick(mq, e);
requeue(mq, e);
r = pre_cache_to_cache(mq, e, result);
}
......@@ -795,7 +868,6 @@ static void insert_in_pre_cache(struct mq_policy *mq,
e->dirty = false;
e->oblock = oblock;
e->hit_count = 1;
e->generation = mq->generation;
push(mq, e);
}
......@@ -828,7 +900,6 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
e->oblock = oblock;
e->dirty = false;
e->hit_count = 1;
e->generation = mq->generation;
push(mq, e);
result->cblock = infer_cblock(&mq->cache_pool, e);
......@@ -905,12 +976,37 @@ static void mq_destroy(struct dm_cache_policy *p)
kfree(mq);
}
static void update_pre_cache_hits(struct list_head *h, void *context)
{
struct entry *e = container_of(h, struct entry, list);
e->hit_count++;
}
static void update_cache_hits(struct list_head *h, void *context)
{
struct mq_policy *mq = context;
struct entry *e = container_of(h, struct entry, list);
e->hit_count++;
mq->hit_count++;
}
static void copy_tick(struct mq_policy *mq)
{
unsigned long flags;
unsigned long flags, tick;
spin_lock_irqsave(&mq->tick_lock, flags);
mq->tick = mq->tick_protected;
tick = mq->tick_protected;
if (tick != mq->tick) {
queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq);
queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq);
queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq);
mq->tick = tick;
}
queue_tick(&mq->pre_cache);
queue_tick(&mq->cache_dirty);
queue_tick(&mq->cache_clean);
queue_update_writeback_sentinels(&mq->cache_dirty);
spin_unlock_irqrestore(&mq->tick_lock, flags);
}
......@@ -1001,7 +1097,6 @@ static int mq_load_mapping(struct dm_cache_policy *p,
e->oblock = oblock;
e->dirty = false; /* this gets corrected in a minute */
e->hit_count = hint_valid ? hint : 1;
e->generation = mq->generation;
push(mq, e);
return 0;
......@@ -1012,10 +1107,15 @@ static int mq_save_hints(struct mq_policy *mq, struct queue *q,
{
int r;
unsigned level;
struct list_head *h;
struct entry *e;
for (level = 0; level < NR_QUEUE_LEVELS; level++)
list_for_each_entry(e, q->qs + level, list) {
list_for_each(h, q->qs + level) {
if (is_sentinel(q, h))
continue;
e = container_of(h, struct entry, list);
r = fn(context, infer_cblock(&mq->cache_pool, e),
e->oblock, e->hit_count);
if (r)
......@@ -1087,10 +1187,27 @@ static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
return r;
}
#define CLEAN_TARGET_PERCENTAGE 25
static bool clean_target_met(struct mq_policy *mq)
{
/*
* Cache entries may not be populated. So we cannot rely on the
* size of the clean queue.
*/
unsigned nr_clean = from_cblock(mq->cache_size) - queue_size(&mq->cache_dirty);
unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_PERCENTAGE / 100;
return nr_clean >= target;
}
static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
dm_cblock_t *cblock)
{
struct entry *e = pop(mq, &mq->cache_dirty);
struct entry *e = pop_old(mq, &mq->cache_dirty);
if (!e && !clean_target_met(mq))
e = pop(mq, &mq->cache_dirty);
if (!e)
return -ENODATA;
......
......@@ -228,7 +228,7 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
*
* tcw: Compatible implementation of the block chaining mode used
* by the TrueCrypt device encryption system (prior to version 4.1).
* For more info see: http://www.truecrypt.org
* For more info see: https://gitlab.com/cryptsetup/cryptsetup/wikis/TrueCryptOnDiskFormat
* It operates on full 512 byte sectors and uses CBC
* with an IV derived from initial key and the sector number.
* In addition, whitening value is applied on every sector, whitening
......@@ -925,11 +925,10 @@ static int crypt_convert(struct crypt_config *cc,
switch (r) {
/* async */
case -EINPROGRESS:
case -EBUSY:
wait_for_completion(&ctx->restart);
reinit_completion(&ctx->restart);
/* fall through*/
case -EINPROGRESS:
ctx->req = NULL;
ctx->cc_sector++;
continue;
......@@ -1124,15 +1123,15 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
{
struct crypt_config *cc = io->cc;
struct bio *base_bio = io->base_bio;
struct bio *clone;
/*
* The block layer might modify the bvec array, so always
* copy the required bvecs because we need the original
* one in order to decrypt the whole bio data *afterwards*.
* We need the original biovec array in order to decrypt
* the whole bio data *afterwards* -- thanks to immutable
* biovecs we don't need to worry about the block layer
* modifying the biovec array; so leverage bio_clone_fast().
*/
clone = bio_clone_bioset(base_bio, gfp, cc->bs);
clone = bio_clone_fast(io->base_bio, gfp, cc->bs);
if (!clone)
return 1;
......@@ -1346,10 +1345,8 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
struct crypt_config *cc = io->cc;
if (error == -EINPROGRESS) {
complete(&ctx->restart);
if (error == -EINPROGRESS)
return;
}
if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
......@@ -1360,12 +1357,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
if (!atomic_dec_and_test(&ctx->cc_pending))
return;
goto done;
if (bio_data_dir(io->base_bio) == READ)
kcryptd_crypt_read_done(io);
else
kcryptd_crypt_write_io_submit(io, 1);
done:
if (!completion_done(&ctx->restart))
complete(&ctx->restart);
}
static void kcryptd_crypt(struct work_struct *work)
......@@ -1816,6 +1816,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (ret)
goto bad;
ret = -EINVAL;
while (opt_params--) {
opt_string = dm_shift_arg(&as);
if (!opt_string) {
......
......@@ -236,7 +236,7 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
delayed->context = dc;
delayed->expires = expires = jiffies + (delay * HZ / 1000);
delayed->expires = expires = jiffies + msecs_to_jiffies(delay);
mutex_lock(&delayed_bios_lock);
......
......@@ -17,7 +17,9 @@
#define DM_LOG_USERSPACE_VSN "1.3.0"
struct flush_entry {
#define FLUSH_ENTRY_POOL_SIZE 16
struct dm_dirty_log_flush_entry {
int type;
region_t region;
struct list_head list;
......@@ -34,22 +36,14 @@ struct flush_entry {
struct log_c {
struct dm_target *ti;
struct dm_dev *log_dev;
uint32_t region_size;
region_t region_count;
uint64_t luid;
char uuid[DM_UUID_LEN];
char *usr_argv_str;
uint32_t usr_argc;
/*
* in_sync_hint gets set when doing is_remote_recovering. It
* represents the first region that needs recovery. IOW, the
* first zero bit of sync_bits. This can be useful to limit
* traffic for calls like is_remote_recovering and get_resync_work,
* but take care in its use for anything else.
*/
uint64_t in_sync_hint;
uint32_t region_size;
region_t region_count;
uint64_t luid;
char uuid[DM_UUID_LEN];
/*
* Mark and clear requests are held until a flush is issued
......@@ -61,6 +55,15 @@ struct log_c {
struct list_head mark_list;
struct list_head clear_list;
/*
* in_sync_hint gets set when doing is_remote_recovering. It
* represents the first region that needs recovery. IOW, the
* first zero bit of sync_bits. This can be useful to limit
* traffic for calls like is_remote_recovering and get_resync_work,
* but take care in its use for anything else.
*/
uint64_t in_sync_hint;
/*
* Workqueue for flush of clear region requests.
*/
......@@ -72,19 +75,11 @@ struct log_c {
* Combine userspace flush and mark requests for efficiency.
*/
uint32_t integrated_flush;
};
static mempool_t *flush_entry_pool;
static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
{
return kmalloc(sizeof(struct flush_entry), gfp_mask);
}
mempool_t *flush_entry_pool;
};
static void flush_entry_free(void *element, void *pool_data)
{
kfree(element);
}
static struct kmem_cache *_flush_entry_cache;
static int userspace_do_request(struct log_c *lc, const char *uuid,
int request_type, char *data, size_t data_size,
......@@ -254,6 +249,14 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
goto out;
}
lc->flush_entry_pool = mempool_create_slab_pool(FLUSH_ENTRY_POOL_SIZE,
_flush_entry_cache);
if (!lc->flush_entry_pool) {
DMERR("Failed to create flush_entry_pool");
r = -ENOMEM;
goto out;
}
/*
* Send table string and get back any opened device.
*/
......@@ -310,6 +313,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
out:
kfree(devices_rdata);
if (r) {
if (lc->flush_entry_pool)
mempool_destroy(lc->flush_entry_pool);
kfree(lc);
kfree(ctr_str);
} else {
......@@ -338,6 +343,8 @@ static void userspace_dtr(struct dm_dirty_log *log)
if (lc->log_dev)
dm_put_device(lc->ti, lc->log_dev);
mempool_destroy(lc->flush_entry_pool);
kfree(lc->usr_argv_str);
kfree(lc);
......@@ -461,7 +468,7 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
{
int r = 0;
struct flush_entry *fe;
struct dm_dirty_log_flush_entry *fe;
list_for_each_entry(fe, flush_list, list) {
r = userspace_do_request(lc, lc->uuid, fe->type,
......@@ -481,7 +488,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list,
int r = 0;
int count;
uint32_t type = 0;
struct flush_entry *fe, *tmp_fe;
struct dm_dirty_log_flush_entry *fe, *tmp_fe;
LIST_HEAD(tmp_list);
uint64_t group[MAX_FLUSH_GROUP_COUNT];
......@@ -563,7 +570,8 @@ static int userspace_flush(struct dm_dirty_log *log)
LIST_HEAD(clear_list);
int mark_list_is_empty;
int clear_list_is_empty;
struct flush_entry *fe, *tmp_fe;
struct dm_dirty_log_flush_entry *fe, *tmp_fe;
mempool_t *flush_entry_pool = lc->flush_entry_pool;
spin_lock_irqsave(&lc->flush_lock, flags);
list_splice_init(&lc->mark_list, &mark_list);
......@@ -643,10 +651,10 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
{
unsigned long flags;
struct log_c *lc = log->context;
struct flush_entry *fe;
struct dm_dirty_log_flush_entry *fe;
/* Wait for an allocation, but _never_ fail */
fe = mempool_alloc(flush_entry_pool, GFP_NOIO);
fe = mempool_alloc(lc->flush_entry_pool, GFP_NOIO);
BUG_ON(!fe);
spin_lock_irqsave(&lc->flush_lock, flags);
......@@ -672,7 +680,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
{
unsigned long flags;
struct log_c *lc = log->context;
struct flush_entry *fe;
struct dm_dirty_log_flush_entry *fe;
/*
* If we fail to allocate, we skip the clearing of
......@@ -680,7 +688,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
* to cause the region to be resync'ed when the
* device is activated next time.
*/
fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
fe = mempool_alloc(lc->flush_entry_pool, GFP_ATOMIC);
if (!fe) {
DMERR("Failed to allocate memory to clear region.");
return;
......@@ -733,7 +741,6 @@ static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
static void userspace_set_region_sync(struct dm_dirty_log *log,
region_t region, int in_sync)
{
int r;
struct log_c *lc = log->context;
struct {
region_t r;
......@@ -743,12 +750,12 @@ static void userspace_set_region_sync(struct dm_dirty_log *log,
pkg.r = region;
pkg.i = (int64_t)in_sync;
r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
(char *)&pkg, sizeof(pkg), NULL, NULL);
(void) userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
(char *)&pkg, sizeof(pkg), NULL, NULL);
/*
* It would be nice to be able to report failures.
* However, it is easy emough to detect and resolve.
* However, it is easy enough to detect and resolve.
*/
return;
}
......@@ -886,18 +893,16 @@ static int __init userspace_dirty_log_init(void)
{
int r = 0;
flush_entry_pool = mempool_create(100, flush_entry_alloc,
flush_entry_free, NULL);
if (!flush_entry_pool) {
DMWARN("Unable to create flush_entry_pool: No memory.");
_flush_entry_cache = KMEM_CACHE(dm_dirty_log_flush_entry, 0);
if (!_flush_entry_cache) {
DMWARN("Unable to create flush_entry_cache: No memory.");
return -ENOMEM;
}
r = dm_ulog_tfr_init();
if (r) {
DMWARN("Unable to initialize userspace log communications");
mempool_destroy(flush_entry_pool);
kmem_cache_destroy(_flush_entry_cache);
return r;
}
......@@ -905,7 +910,7 @@ static int __init userspace_dirty_log_init(void)
if (r) {
DMWARN("Couldn't register userspace dirty log type");
dm_ulog_tfr_exit();
mempool_destroy(flush_entry_pool);
kmem_cache_destroy(_flush_entry_cache);
return r;
}
......@@ -917,7 +922,7 @@ static void __exit userspace_dirty_log_exit(void)
{
dm_dirty_log_type_unregister(&_userspace_type);
dm_ulog_tfr_exit();
mempool_destroy(flush_entry_pool);
kmem_cache_destroy(_flush_entry_cache);
DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
return;
......
......@@ -172,6 +172,7 @@ int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
char *rdata, size_t *rdata_size)
{
int r = 0;
unsigned long tmo;
size_t dummy = 0;
int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg);
struct dm_ulog_request *tfr = prealloced_ulog_tfr;
......@@ -236,11 +237,11 @@ int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
goto out;
}
r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
tmo = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
spin_lock(&receiving_list_lock);
list_del_init(&(pkg.list));
spin_unlock(&receiving_list_lock);
if (!r) {
if (!tmo) {
DMWARN("[%s] Request timed out: [%u/%u] - retrying",
(strlen(uuid) > 8) ?
(uuid + (strlen(uuid) - 8)) : (uuid),
......
This diff has been collapsed.
......@@ -428,7 +428,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
} else {
/* blk-mq request-based interface */
*__clone = blk_get_request(bdev_get_queue(bdev),
rq_data_dir(rq), GFP_KERNEL);
rq_data_dir(rq), GFP_ATOMIC);
if (IS_ERR(*__clone))
/* ENOMEM, requeue */
return r;
......@@ -1627,7 +1627,7 @@ static int __pgpath_busy(struct pgpath *pgpath)
{
struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
return dm_underlying_device_busy(q);
return blk_lld_busy(q);
}
/*
......@@ -1703,7 +1703,7 @@ static int multipath_busy(struct dm_target *ti)
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
.version = {1, 8, 0},
.version = {1, 9, 0},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
......
......@@ -11,7 +11,7 @@
struct dm_sysfs_attr {
struct attribute attr;
ssize_t (*show)(struct mapped_device *, char *);
ssize_t (*store)(struct mapped_device *, char *);
ssize_t (*store)(struct mapped_device *, const char *, size_t count);
};
#define DM_ATTR_RO(_name) \
......@@ -39,6 +39,31 @@ static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr,
return ret;
}
#define DM_ATTR_RW(_name) \
struct dm_sysfs_attr dm_attr_##_name = \
__ATTR(_name, S_IRUGO | S_IWUSR, dm_attr_##_name##_show, dm_attr_##_name##_store)
static ssize_t dm_attr_store(struct kobject *kobj, struct attribute *attr,
const char *page, size_t count)
{
struct dm_sysfs_attr *dm_attr;
struct mapped_device *md;
ssize_t ret;
dm_attr = container_of(attr, struct dm_sysfs_attr, attr);
if (!dm_attr->store)
return -EIO;
md = dm_get_from_kobject(kobj);
if (!md)
return -EINVAL;
ret = dm_attr->store(md, page, count);
dm_put(md);
return ret;
}
static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf)
{
if (dm_copy_name_and_uuid(md, buf, NULL))
......@@ -64,25 +89,33 @@ static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
return strlen(buf);
}
static ssize_t dm_attr_use_blk_mq_show(struct mapped_device *md, char *buf)
{
sprintf(buf, "%d\n", dm_use_blk_mq(md));
return strlen(buf);
}
static DM_ATTR_RO(name);
static DM_ATTR_RO(uuid);
static DM_ATTR_RO(suspended);
static DM_ATTR_RO(use_blk_mq);
static DM_ATTR_RW(rq_based_seq_io_merge_deadline);
static struct attribute *dm_attrs[] = {
&dm_attr_name.attr,
&dm_attr_uuid.attr,
&dm_attr_suspended.attr,
&dm_attr_use_blk_mq.attr,
&dm_attr_rq_based_seq_io_merge_deadline.attr,
NULL,
};
static const struct sysfs_ops dm_sysfs_ops = {
.show = dm_attr_show,
.store = dm_attr_store,
};
/*
* dm kobject is embedded in mapped_device structure
* no need to define release function here
*/
static struct kobj_type dm_ktype = {
.sysfs_ops = &dm_sysfs_ops,
.default_attrs = dm_attrs,
......
......@@ -18,6 +18,8 @@
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/blk-mq.h>
#include <linux/mount.h>
#define DM_MSG_PREFIX "table"
......@@ -372,23 +374,18 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
int r;
dev_t uninitialized_var(dev);
struct dm_dev_internal *dd;
unsigned int major, minor;
struct dm_table *t = ti->table;
char dummy;
struct block_device *bdev;
BUG_ON(!t);
if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) {
/* Extract the major/minor numbers */
dev = MKDEV(major, minor);
if (MAJOR(dev) != major || MINOR(dev) != minor)
return -EOVERFLOW;
/* convert the path to a device */
bdev = lookup_bdev(path);
if (IS_ERR(bdev)) {
dev = name_to_dev_t(path);
if (!dev)
return -ENODEV;
} else {
/* convert the path to a device */
struct block_device *bdev = lookup_bdev(path);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
dev = bdev->bd_dev;
bdput(bdev);
}
......@@ -939,7 +936,7 @@ bool dm_table_mq_request_based(struct dm_table *t)
return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED;
}
static int dm_table_alloc_md_mempools(struct dm_table *t)
static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
{
unsigned type = dm_table_get_type(t);
unsigned per_bio_data_size = 0;
......@@ -957,7 +954,7 @@ static int dm_table_alloc_md_mempools(struct dm_table *t)
per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
}
t->mempools = dm_alloc_md_mempools(type, t->integrity_supported, per_bio_data_size);
t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
if (!t->mempools)
return -ENOMEM;
......@@ -1127,7 +1124,7 @@ int dm_table_complete(struct dm_table *t)
return r;
}
r = dm_table_alloc_md_mempools(t);
r = dm_table_alloc_md_mempools(t, t->md);
if (r)
DMERR("unable to allocate mempools");
......@@ -1339,14 +1336,14 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
continue;
if (ti->flush_supported)
return 1;
return true;
if (ti->type->iterate_devices &&
ti->type->iterate_devices(ti, device_flush_capable, &flush))
return 1;
return true;
}
return 0;
return false;
}
static bool dm_table_discard_zeroes_data(struct dm_table *t)
......@@ -1359,10 +1356,10 @@ static bool dm_table_discard_zeroes_data(struct dm_table *t)
ti = dm_table_get_target(t, i++);
if (ti->discard_zeroes_data_unsupported)
return 0;
return false;
}
return 1;
return true;
}
static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
......@@ -1408,10 +1405,10 @@ static bool dm_table_all_devices_attribute(struct dm_table *t,
if (!ti->type->iterate_devices ||
!ti->type->iterate_devices(ti, func, NULL))
return 0;
return false;
}
return 1;
return true;
}
static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
......@@ -1468,14 +1465,14 @@ static bool dm_table_supports_discards(struct dm_table *t)
continue;
if (ti->discards_supported)
return 1;
return true;
if (ti->type->iterate_devices &&
ti->type->iterate_devices(ti, device_discard_capable, NULL))
return 1;
return true;
}
return 0;
return false;
}
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
......@@ -1677,20 +1674,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
return r;
}
int dm_table_any_busy_target(struct dm_table *t)
{
unsigned i;
struct dm_target *ti;
for (i = 0; i < t->num_targets; i++) {
ti = t->targets + i;
if (ti->type->busy && ti->type->busy(ti))
return 1;
}
return 0;
}
struct mapped_device *dm_table_get_md(struct dm_table *t)
{
return t->md;
......@@ -1709,9 +1692,13 @@ void dm_table_run_md_queue_async(struct dm_table *t)
md = dm_table_get_md(t);
queue = dm_get_md_queue(md);
if (queue) {
spin_lock_irqsave(queue->queue_lock, flags);
blk_run_queue_async(queue);
spin_unlock_irqrestore(queue->queue_lock, flags);
if (queue->mq_ops)
blk_mq_run_hw_queues(queue, true);
else {
spin_lock_irqsave(queue->queue_lock, flags);
blk_run_queue_async(queue);
spin_unlock_irqrestore(queue->queue_lock, flags);
}
}
}
EXPORT_SYMBOL(dm_table_run_md_queue_async);
......
......@@ -18,20 +18,39 @@
#include <linux/module.h>
#include <linux/device-mapper.h>
#include <linux/reboot.h>
#include <crypto/hash.h>
#define DM_MSG_PREFIX "verity"
#define DM_VERITY_ENV_LENGTH 42
#define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR"
#define DM_VERITY_IO_VEC_INLINE 16
#define DM_VERITY_MEMPOOL_SIZE 4
#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
#define DM_VERITY_MAX_LEVELS 63
#define DM_VERITY_MAX_CORRUPTED_ERRS 100
#define DM_VERITY_OPT_LOGGING "ignore_corruption"
#define DM_VERITY_OPT_RESTART "restart_on_corruption"
static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
enum verity_mode {
DM_VERITY_MODE_EIO,
DM_VERITY_MODE_LOGGING,
DM_VERITY_MODE_RESTART
};
enum verity_block_type {
DM_VERITY_BLOCK_TYPE_DATA,
DM_VERITY_BLOCK_TYPE_METADATA
};
struct dm_verity {
struct dm_dev *data_dev;
struct dm_dev *hash_dev;
......@@ -54,6 +73,8 @@ struct dm_verity {
unsigned digest_size; /* digest size for the current hash algorithm */
unsigned shash_descsize;/* the size of temporary space for crypto */
int hash_failed; /* set to 1 if hash of any block failed */
enum verity_mode mode; /* mode for handling verification errors */
unsigned corrupted_errs;/* Number of errors for corrupted blocks */
mempool_t *vec_mempool; /* mempool of bio vector */
......@@ -174,6 +195,57 @@ static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
*offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits);
}
/*
* Handle verification errors.
*/
static int verity_handle_err(struct dm_verity *v, enum verity_block_type type,
unsigned long long block)
{
char verity_env[DM_VERITY_ENV_LENGTH];
char *envp[] = { verity_env, NULL };
const char *type_str = "";
struct mapped_device *md = dm_table_get_md(v->ti->table);
/* Corruption should be visible in device status in all modes */
v->hash_failed = 1;
if (v->corrupted_errs >= DM_VERITY_MAX_CORRUPTED_ERRS)
goto out;
v->corrupted_errs++;
switch (type) {
case DM_VERITY_BLOCK_TYPE_DATA:
type_str = "data";
break;
case DM_VERITY_BLOCK_TYPE_METADATA:
type_str = "metadata";
break;
default:
BUG();
}
DMERR("%s: %s block %llu is corrupted", v->data_dev->name, type_str,
block);
if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS)
DMERR("%s: reached maximum errors", v->data_dev->name);
snprintf(verity_env, DM_VERITY_ENV_LENGTH, "%s=%d,%llu",
DM_VERITY_ENV_VAR_NAME, type, block);
kobject_uevent_env(&disk_to_dev(dm_disk(md))->kobj, KOBJ_CHANGE, envp);
out:
if (v->mode == DM_VERITY_MODE_LOGGING)
return 0;
if (v->mode == DM_VERITY_MODE_RESTART)
kernel_restart("dm-verity device corrupted");
return 1;
}
/*
* Verify hash of a metadata block pertaining to the specified data block
* ("block" argument) at a specified level ("level" argument).
......@@ -251,11 +323,11 @@ static int verity_verify_level(struct dm_verity_io *io, sector_t block,
goto release_ret_r;
}
if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
DMERR_LIMIT("metadata block %llu is corrupted",
(unsigned long long)hash_block);
v->hash_failed = 1;
r = -EIO;
goto release_ret_r;
if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA,
hash_block)) {
r = -EIO;
goto release_ret_r;
}
} else
aux->hash_verified = 1;
}
......@@ -367,10 +439,9 @@ static int verity_verify_io(struct dm_verity_io *io)
return r;
}
if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
DMERR_LIMIT("data block %llu is corrupted",
(unsigned long long)(io->block + b));
v->hash_failed = 1;
return -EIO;
if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
io->block + b))
return -EIO;
}
}
......@@ -546,6 +617,19 @@ static void verity_status(struct dm_target *ti, status_type_t type,
else
for (x = 0; x < v->salt_size; x++)
DMEMIT("%02x", v->salt[x]);
if (v->mode != DM_VERITY_MODE_EIO) {
DMEMIT(" 1 ");
switch (v->mode) {
case DM_VERITY_MODE_LOGGING:
DMEMIT(DM_VERITY_OPT_LOGGING);
break;
case DM_VERITY_MODE_RESTART:
DMEMIT(DM_VERITY_OPT_RESTART);
break;
default:
BUG();
}
}
break;
}
}
......@@ -647,13 +731,19 @@ static void verity_dtr(struct dm_target *ti)
static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
struct dm_verity *v;
unsigned num;
struct dm_arg_set as;
const char *opt_string;
unsigned int num, opt_params;
unsigned long long num_ll;
int r;
int i;
sector_t hash_position;
char dummy;
static struct dm_arg _args[] = {
{0, 1, "Invalid number of feature args"},
};
v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
if (!v) {
ti->error = "Cannot allocate verity structure";
......@@ -668,8 +758,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
if (argc != 10) {
ti->error = "Invalid argument count: exactly 10 arguments required";
if (argc < 10) {
ti->error = "Not enough arguments";
r = -EINVAL;
goto bad;
}
......@@ -790,6 +880,39 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
}
argv += 10;
argc -= 10;
/* Optional parameters */
if (argc) {
as.argc = argc;
as.argv = argv;
r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
if (r)
goto bad;
while (opt_params) {
opt_params--;
opt_string = dm_shift_arg(&as);
if (!opt_string) {
ti->error = "Not enough feature arguments";
r = -EINVAL;
goto bad;
}
if (!strcasecmp(opt_string, DM_VERITY_OPT_LOGGING))
v->mode = DM_VERITY_MODE_LOGGING;
else if (!strcasecmp(opt_string, DM_VERITY_OPT_RESTART))
v->mode = DM_VERITY_MODE_RESTART;
else {
ti->error = "Invalid feature arguments";
r = -EINVAL;
goto bad;
}
}
}
v->hash_per_block_bits =
__fls((1 << v->hash_dev_block_bits) / v->digest_size);
......
This diff has been collapsed.
......@@ -70,7 +70,6 @@ void dm_table_presuspend_undo_targets(struct dm_table *t);
void dm_table_postsuspend_targets(struct dm_table *t);
int dm_table_resume_targets(struct dm_table *t);
int dm_table_any_congested(struct dm_table *t, int bdi_bits);
int dm_table_any_busy_target(struct dm_table *t);
unsigned dm_table_get_type(struct dm_table *t);
struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
......@@ -212,6 +211,8 @@ int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
void dm_internal_suspend(struct mapped_device *md);
void dm_internal_resume(struct mapped_device *md);
bool dm_use_blk_mq(struct mapped_device *md);
int dm_io_init(void);
void dm_io_exit(void);
......@@ -221,7 +222,8 @@ void dm_kcopyd_exit(void);
/*
* Mempool operations
*/
struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size);
struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
unsigned integrity, unsigned per_bio_data_size);
void dm_free_md_mempools(struct dm_md_mempools *pools);
/*
......@@ -235,4 +237,8 @@ static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen
return !maxlen || strlen(result) + 1 >= maxlen;
}
ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf);
ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
const char *buf, size_t count);
#endif
......@@ -605,9 +605,4 @@ static inline unsigned long to_bytes(sector_t n)
return (n << SECTOR_SHIFT);
}
/*-----------------------------------------------------------------
* Helper for block layer and dm core operations
*---------------------------------------------------------------*/
int dm_underlying_device_busy(struct request_queue *q);
#endif /* _LINUX_DEVICE_MAPPER_H */
......@@ -92,6 +92,6 @@ extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list);
extern void mark_mounts_for_expiry(struct list_head *mounts);
extern dev_t name_to_dev_t(char *name);
extern dev_t name_to_dev_t(const char *name);
#endif /* _LINUX_MOUNT_H */
......@@ -267,9 +267,9 @@ enum {
#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
#define DM_VERSION_MAJOR 4
#define DM_VERSION_MINOR 30
#define DM_VERSION_MINOR 31
#define DM_VERSION_PATCHLEVEL 0
#define DM_VERSION_EXTRA "-ioctl (2014-12-22)"
#define DM_VERSION_EXTRA "-ioctl (2015-3-12)"
/* Status bits */
#define DM_READONLY_FLAG (1 << 0) /* In/Out */
......
......@@ -207,7 +207,7 @@ static dev_t devt_from_partuuid(const char *uuid_str)
* bangs.
*/
dev_t name_to_dev_t(char *name)
dev_t name_to_dev_t(const char *name)
{
char s[32];
char *p;
......@@ -226,8 +226,9 @@ dev_t name_to_dev_t(char *name)
if (strncmp(name, "/dev/", 5) != 0) {
unsigned maj, min;
char dummy;
if (sscanf(name, "%u:%u", &maj, &min) == 2) {
if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2) {
res = MKDEV(maj, min);
if (maj != MAJOR(res) || min != MINOR(res))
goto fail;
......@@ -286,6 +287,7 @@ dev_t name_to_dev_t(char *name)
done:
return res;
}
EXPORT_SYMBOL_GPL(name_to_dev_t);
static int __init root_dev_setup(char *line)
{
......